path: root/kernel
author    Eric Paris <eparis@redhat.com>  2013-11-22 18:57:08 -0500
committer Eric Paris <eparis@redhat.com>  2013-11-22 18:57:54 -0500
commit    fc582aef7dcc27a7120cf232c1e76c569c7b6eab
tree      7d275dd4ceab6067b91e9a25a5f6338b425fbccd /kernel
parent    9175c9d2aed528800175ef81c90569d00d23f9be
parent    5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52
Merge tag 'v3.12'
Linux 3.12

Conflicts:
	fs/exec.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/audit.c | 5
-rw-r--r--  kernel/capability.c | 13
-rw-r--r--  kernel/cgroup.c | 1676
-rw-r--r--  kernel/cgroup_freezer.c | 155
-rw-r--r--  kernel/context_tracking.c | 137
-rw-r--r--  kernel/cpu.c | 9
-rw-r--r--  kernel/cpuset.c | 317
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 442
-rw-r--r--  kernel/events/ring_buffer.c | 31
-rw-r--r--  kernel/events/uprobes.c | 4
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 40
-rw-r--r--  kernel/gcov/fs.c | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 13
-rw-r--r--  kernel/irq/Kconfig | 12
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/kexec.c | 5
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 95
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/lglock.c | 12
-rw-r--r--  kernel/modsign_pubkey.c | 6
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/mutex.c | 75
-rw-r--r--  kernel/nsproxy.c | 36
-rw-r--r--  kernel/padata.c | 32
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 34
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/power/hibernate.c | 49
-rw-r--r--  kernel/power/snapshot.c | 17
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/user.c | 32
-rw-r--r--  kernel/printk/printk.c | 7
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcu.h | 12
-rw-r--r--  kernel/rcupdate.c | 104
-rw-r--r--  kernel/rcutiny.c | 2
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 396
-rw-r--r--  kernel/rcutree.c | 255
-rw-r--r--  kernel/rcutree.h | 19
-rw-r--r--  kernel/rcutree_plugin.h | 460
-rw-r--r--  kernel/reboot.c | 9
-rw-r--r--  kernel/res_counter.c | 25
-rw-r--r--  kernel/sched/core.c | 156
-rw-r--r--  kernel/sched/cpuacct.c | 51
-rw-r--r--  kernel/sched/cputime.c | 74
-rw-r--r--  kernel/sched/debug.c | 6
-rw-r--r--  kernel/sched/fair.c | 631
-rw-r--r--  kernel/sched/sched.h | 14
-rw-r--r--  kernel/sched/stats.h | 5
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 32
-rw-r--r--  kernel/softirq.c | 17
-rw-r--r--  kernel/spinlock.c | 14
-rw-r--r--  kernel/sys.c | 20
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/task_work.c | 40
-rw-r--r--  kernel/time/Kconfig | 51
-rw-r--r--  kernel/time/clockevents.c | 65
-rw-r--r--  kernel/time/ntp.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 61
-rw-r--r--  kernel/time/timekeeping.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 17
-rw-r--r--  kernel/trace/trace.c | 37
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_events.c | 207
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 10
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/up.c | 58
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 68
-rw-r--r--  kernel/workqueue.c | 148
81 files changed, 3684 insertions, 2757 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..1ce47553fb02 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y += sched/
 obj-y += power/
 obj-y += printk/
 obj-y += cpu/
+obj-y += irq/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -79,7 +80,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
diff --git a/kernel/audit.c b/kernel/audit.c
index b8831ac25b70..906ae5a0233a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1222,9 +1222,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 
 			sleep_time = timeout_start + audit_backlog_wait_time -
 					jiffies;
-			if ((long)sleep_time > 0)
+			if ((long)sleep_time > 0) {
 				wait_for_auditd(sleep_time);
 				continue;
+			}
 		}
 		if (audit_rate_check() && printk_ratelimit())
 			printk(KERN_WARNING
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap)
 EXPORT_SYMBOL(capable);
 
 /**
- * nsown_capable - Check superior capability to one's own user_ns
- * @cap: The capability in question
- *
- * Return true if the current task has the given superior capability
- * targeted at its own user namespace.
- */
-bool nsown_capable(int cap)
-{
-	return ns_capable(current_user_ns(), cap);
-}
-
-/**
  * inode_capable - Check superior capability over inode
  * @inode: The inode in question
  * @cap: The capability in question
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
 
 	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
 }
+EXPORT_SYMBOL(inode_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..8bd9cfdc70d7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/file.h>
 
 #include <linux/atomic.h>
 
@@ -81,7 +82,7 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
 #else
 static DEFINE_MUTEX(cgroup_mutex);
 #endif
@@ -117,6 +118,7 @@ struct cfent {
 	struct list_head node;
 	struct dentry *dentry;
 	struct cftype *type;
+	struct cgroup_subsys_state *css;
 
 	/* file xattrs */
 	struct simple_xattrs xattrs;
@@ -159,9 +161,9 @@ struct css_id {
  */
 struct cgroup_event {
 	/*
-	 * Cgroup which the event belongs to.
+	 * css which the event belongs to.
 	 */
-	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
 	/*
 	 * Control file which the event associated.
 	 */
@@ -215,10 +217,33 @@ static u64 cgroup_serial_nr_next = 1;
  */
 static int need_forkexit_callback __read_mostly;
 
-static void cgroup_offline_fn(struct work_struct *work);
+static struct cftype cgroup_base_files[];
+
+static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add);
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
+{
+	if (ss)
+		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
+					     lockdep_is_held(&cgroup_mutex));
+	else
+		return &cgrp->dummy_css;
+}
 
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +390,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
 
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+/*
+ * css_set_lock protects the list of css_set objects, and the chain of
+ * tasks off each css_set. Nests outside task->alloc_lock due to
+ * css_task_iter_start().
+ */
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
@@ -392,10 +419,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
+/*
+ * We don't maintain the lists running through each css_set to its task
+ * until after the first call to css_task_iter_start(). This reduces the
+ * fork()/exit() overhead for people who have cgroups compiled into their
+ * kernel but not actually in use.
+ */
 static int use_task_css_set_links __read_mostly;
 
 static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +493,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
 static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +584,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
 			 * cgroup */
-			template[i] = cgrp->subsys[i];
+			template[i] = cgroup_css(cgrp, ss);
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
@@ -803,8 +832,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -813,8 +841,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
-static int alloc_css_id(struct cgroup_subsys *ss,
-			struct cgroup *parent, struct cgroup *child);
+static int alloc_css_id(struct cgroup_subsys_state *child_css);
 
 static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 {
@@ -845,15 +872,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
-	/*
-	 * Release the subsystem state objects.
-	 */
-	for_each_root_subsys(cgrp->root, ss)
-		ss->css_free(cgrp);
-
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);
 
@@ -864,8 +884,6 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	dput(cgrp->parent->dentry);
 
-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
 	/*
 	 * Drop the active superblock reference that we took when we
 	 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +974,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_directory - selective removal of base and subsystem files
- * @dir: directory containing the files
- * @base_files: true if the base files should be removed
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_directory(struct dentry *dir, bool base_files,
-				   unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
-	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
+	int i;
 
-	for_each_root_subsys(cgrp->root, ss) {
+	for_each_subsys(ss, i) {
 		struct cftype_set *set;
-		if (!test_bit(ss->subsys_id, &subsys_mask))
+
+		if (!test_bit(i, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
-	}
-	if (base_files) {
-		while (!list_empty(&cgrp->files))
-			cgroup_rm_file(cgrp, NULL);
+			cgroup_addrm_files(cgrp, set->cfts, false);
 	}
 }
 
@@ -986,9 +999,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-
-	cgroup_clear_directory(dentry, true, root->subsys_mask);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1009,79 +1019,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
-	int i;
+	unsigned long pinned = 0;
+	int i, ret;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
 	/* Check that any added subsystems are currently free */
 	for_each_subsys(ss, i) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & added_mask))
+		if (!(added_mask & (1 << i)))
 			continue;
 
+		/* is the subsystem mounted elsewhere? */
 		if (ss->root != &cgroup_dummy_root) {
-			/* Subsystem isn't free */
-			return -EBUSY;
+			ret = -EBUSY;
+			goto out_put;
+		}
+
+		/* pin the module */
+		if (!try_module_get(ss->module)) {
+			ret = -ENOENT;
+			goto out_put;
 		}
+		pinned |= 1 << i;
 	}
 
-	/* Currently we don't handle adding/removing subsystems when
-	 * any child cgroups exist. This is theoretically supportable
-	 * but involves complex error handling, so it's being left until
-	 * later */
-	if (root->number_of_cgroups > 1)
-		return -EBUSY;
+	/* subsys could be missing if unloaded between parsing and here */
+	if (added_mask != pinned) {
+		ret = -ENOENT;
+		goto out_put;
+	}
+
+	ret = cgroup_populate_dir(cgrp, added_mask);
+	if (ret)
+		goto out_put;
+
+	/*
+	 * Nothing can fail from this point on. Remove files for the
+	 * removed subsystems and rebind each subsystem.
+	 */
+	cgroup_clear_dir(cgrp, removed_mask);
 
-	/* Process each subsystem */
 	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
 
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+			BUG_ON(cgroup_css(cgrp, ss));
+			BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
+
+			rcu_assign_pointer(cgrp->subsys[i],
+					   cgroup_css(cgroup_dummy_top, ss));
+			cgroup_css(cgrp, ss)->cgroup = cgrp;
 
-			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
-			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
-				ss->bind(cgrp);
+				ss->bind(cgroup_css(cgrp, ss));
 
 			/* refcount was already taken, and we're keeping it */
 			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
 
 			if (ss->bind)
-				ss->bind(cgroup_dummy_top);
-			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
-			cgrp->subsys[i] = NULL;
+				ss->bind(cgroup_css(cgroup_dummy_top, ss));
+
+			cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
+			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
 			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
 
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 			root->subsys_mask &= ~bit;
-		} else if (bit & root->subsys_mask) {
-			/* Subsystem state should already exist */
-			BUG_ON(!cgrp->subsys[i]);
-			/*
-			 * a refcount was taken, but we already had one, so
-			 * drop the extra reference.
-			 */
-			module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
-			BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
-		} else {
-			/* Subsystem state shouldn't exist */
-			BUG_ON(cgrp->subsys[i]);
 		}
 	}
 
@@ -1092,6 +1107,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
+
+out_put:
+	for_each_subsys(ss, i)
+		if (pinned & (1 << i))
+			module_put(ss->module);
+	return ret;
 }
 
 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1163,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	bool module_pin_failed = false;
 	struct cgroup_subsys *ss;
 	int i;
 
@@ -1285,52 +1305,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;
 
-	/*
-	 * Grab references on all the modules we'll need, so the subsystems
-	 * don't dance around before rebind_subsystems attaches them. This may
-	 * take duplicate reference counts on a subsystem that's already used,
-	 * but rebind_subsystems handles this case.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts->subsys_mask & (1UL << i)))
-			continue;
-		if (!try_module_get(cgroup_subsys[i]->module)) {
-			module_pin_failed = true;
-			break;
-		}
-	}
-	if (module_pin_failed) {
-		/*
-		 * oops, one of the modules was going away. this means that we
-		 * raced with a module_delete call, and to the user this is
-		 * essentially a "subsystem doesn't exist" case.
-		 */
-		for (i--; i >= 0; i--) {
-			/* drop refcounts only on the ones we took */
-			unsigned long bit = 1UL << i;
-
-			if (!(bit & opts->subsys_mask))
-				continue;
-			module_put(cgroup_subsys[i]->module);
-		}
-		return -ENOENT;
-	}
-
 	return 0;
 }
 
-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	mutex_lock(&cgroup_mutex);
-	for_each_subsys(ss, i)
-		if (subsys_mask & (1UL << i))
-			module_put(cgroup_subsys[i]->module);
-	mutex_unlock(&cgroup_mutex);
-}
-
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1370,22 +1347,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/*
-	 * Clear out the files of subsystems that should be removed, do
-	 * this before rebind_subsystems, since rebind_subsystems may
-	 * change this hierarchy's subsys_list.
-	 */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
-
-	ret = rebind_subsystems(root, added_mask, removed_mask);
-	if (ret) {
-		/* rebind_subsystems failed, re-populate the removed files */
-		cgroup_populate_dir(cgrp, false, removed_mask);
+	/* remounting is not allowed for populated hierarchies */
+	if (root->number_of_cgroups > 1) {
+		ret = -EBUSY;
 		goto out_unlock;
 	}
 
-	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, false, added_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
+	if (ret)
+		goto out_unlock;
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1365,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-	if (ret)
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1416,6 +1384,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	cgrp->dummy_css.cgroup = cgrp;
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1400,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	cgrp->root = root;
 	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
+	idr_init(&root->cgroup_idr);
 }
 
 static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1473,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
-	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
@@ -1519,7 +1488,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
 		/* hierarhcy ID shoulid already have been released */
 		WARN_ON_ONCE(root->hierarchy_id);
 
-		ida_destroy(&root->cgroup_ida);
+		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
 }
@@ -1584,7 +1553,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct list_head tmp_links;
 	struct inode *inode;
+	const struct cred *cred;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1600,7 +1571,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 	opts.new_root = new_root;
 
@@ -1609,17 +1580,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_free_root(opts.new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 
 	root = sb->s_fs_info;
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
-		const struct cred *cred;
 		int i;
 		struct css_set *cset;
 
@@ -1634,6 +1603,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		mutex_lock(&cgroup_mutex);
 		mutex_lock(&cgroup_root_mutex);
 
+		root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+					  0, 1, GFP_KERNEL);
+		if (root_cgrp->id < 0)
+			goto unlock_drop;
+
 		/* Check for name clashes with existing mounts */
 		ret = -EBUSY;
 		if (strlen(root->name))
@@ -1657,26 +1631,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		if (ret)
 			goto unlock_drop;
 
+		sb->s_root->d_fsdata = root_cgrp;
+		root_cgrp->dentry = sb->s_root;
+
+		/*
+		 * We're inside get_sb() and will call lookup_one_len() to
+		 * create the root files, which doesn't work if SELinux is
+		 * in use. The following cred dancing somehow works around
+		 * it. See 2ce9738ba ("cgroupfs: use init_cred when
+		 * populating new cgroupfs mount") for more details.
+		 */
+		cred = override_creds(&init_cred);
+
+		ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+		if (ret)
+			goto rm_base_files;
+
 		ret = rebind_subsystems(root, root->subsys_mask, 0);
-		if (ret == -EBUSY) {
-			free_cgrp_cset_links(&tmp_links);
-			goto unlock_drop;
-		}
+		if (ret)
+			goto rm_base_files;
+
+		revert_creds(cred);
+
 		/*
 		 * There must be no failure case after here, since rebinding
 		 * takes care of subsystems' refcounts, which are explicitly
 		 * dropped in the failure exit path.
 		 */
 
-		/* EBUSY should be the only error here */
-		BUG_ON(ret);
-
 		list_add(&root->root_list, &cgroup_roots);
 		cgroup_root_count++;
 
-		sb->s_root->d_fsdata = root_cgrp;
-		root->top_cgroup.dentry = sb->s_root;
-
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
@@ -1689,9 +1674,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
-		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1693,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
 			}
 		}
-
-		/* no subsys rebinding, so refcounts don't change */
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
 
 	kfree(opts.release_agent);
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ rm_base_files:
+	free_cgrp_cset_links(&tmp_links);
+	cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
+	revert_creds(cred);
  unlock_drop:
 	cgroup_exit_root_id(root);
 	mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1710,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
- drop_modules:
-	drop_parsed_module_refcounts(opts.subsys_mask);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1746,6 +1727,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
 
+	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
@@ -1778,6 +1760,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -1889,7 +1872,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
 struct task_and_cgroup {
 	struct task_struct *task;
 	struct cgroup *cgrp;
-	struct css_set *cg;
+	struct css_set *cset;
 };
 
 struct cgroup_taskset {
@@ -1939,18 +1922,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 EXPORT_SYMBOL_GPL(cgroup_taskset_next);
 
 /**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * cgroup_taskset_cur_css - return the matching css for the current task
  * @tset: taskset of interest
+ * @subsys_id: the ID of the target subsystem
  *
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
+ * Return the css for the current (last returned) task of @tset for
+ * subsystem specified by @subsys_id. This function must be preceded by
+ * either cgroup_taskset_first() or cgroup_taskset_next().
  */
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
+						   int subsys_id)
 {
-	return tset->cur_cgrp;
+	return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
 }
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
 
 /**
  * cgroup_taskset_size - return the number of tasks in taskset
@@ -2054,7 +2039,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 
 		/* @tsk either already exited or can't exit until the end */
 		if (tsk->flags & PF_EXITING)
-			continue;
+			goto next;
 
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
@@ -2062,7 +2047,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 		ent.cgrp = task_cgroup_from_root(tsk, root);
 		/* nothing to do if this task is already in the cgroup */
 		if (ent.cgrp == cgrp)
-			continue;
+			goto next;
 		/*
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
@@ -2070,7 +2055,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
-
+	next:
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, tsk);
@@ -2089,8 +2074,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->can_attach) {
-			retval = ss->can_attach(cgrp, &tset);
+			retval = ss->can_attach(css, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2107,8 +2094,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 
 		tc = flex_array_get(group, i);
 		old_cset = task_css_set(tc->task);
-		tc->cg = find_css_set(old_cset, cgrp);
-		if (!tc->cg) {
+		tc->cset = find_css_set(old_cset, cgrp);
+		if (!tc->cset) {
 			retval = -ENOMEM;
 			goto out_put_css_set_refs;
 		}
@@ -2121,7 +2108,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
@@ -2129,8 +2116,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 4: do subsystem attach callbacks.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->attach)
-			ss->attach(cgrp, &tset);
+			ss->attach(css, &tset);
 	}
 
 	/*
@@ -2141,18 +2130,20 @@ out_put_css_set_refs:
 	if (retval) {
 		for (i = 0; i < group_size; i++) {
 			tc = flex_array_get(group, i);
-			if (!tc->cg)
+			if (!tc->cset)
 				break;
-			put_css_set(tc->cg);
+			put_css_set(tc->cset);
 		}
 	}
 out_cancel_attach:
 	if (retval) {
 		for_each_root_subsys(root, ss) {
+			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 			if (ss == failed_ss)
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(cgrp, &tset);
+				ss->cancel_attach(css, &tset);
 		}
 	}
 out_free_group_list:
@@ -2253,9 +2244,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
 
-		retval = cgroup_attach_task(from_cg, tsk, false);
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
 			break;
 	}
@@ -2265,34 +2256,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+static int cgroup_tasks_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 pid)
 {
-	return attach_task_by_pid(cgrp, pid, false);
+	return attach_task_by_pid(css->cgroup, pid, false);
 }
 
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static int cgroup_procs_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 tgid)
 {
-	return attach_task_by_pid(cgrp, tgid, true);
+	return attach_task_by_pid(css->cgroup, tgid, true);
 }
 
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
-				      const char *buffer)
+static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
+				      struct cftype *cft, const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
-	if (!cgroup_lock_live_group(cgrp))
+	if (!cgroup_lock_live_group(css->cgroup))
 		return -ENODEV;
 	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
+	strcpy(css->cgroup->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
+	struct cgroup *cgrp = css->cgroup;
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2296,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
-	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
 	return 0;
 }
 
 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64
 
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
+				struct cftype *cft, struct file *file,
+				const char __user *userbuf, size_t nbytes,
+				loff_t *unused_ppos)
 {
 	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2332,22 +2327,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_u64(cgrp, cft, val);
+		retval = cft->write_u64(css, cft, val);
 	} else {
 		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_s64(cgrp, cft, val);
+		retval = cft->write_s64(css, cft, val);
 	}
 	if (!retval)
 		retval = nbytes;
 	return retval;
 }
 
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   const char __user *userbuf,
-				   size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
+				   struct cftype *cft, struct file *file,
+				   const char __user *userbuf, size_t nbytes,
+				   loff_t *unused_ppos)
 {
 	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2370,7 +2365,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	buffer[nbytes] = 0; /* nul-terminate */
-	retval = cft->write_string(cgrp, cft, strstrip(buffer));
+	retval = cft->write_string(css, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
@@ -2380,65 +2375,60 @@ out:
 }
 
 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 				 size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct cgroup_subsys_state *css = cfe->css;
 
-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
 	if (cft->write)
-		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->write(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
-		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_string)
-		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
 	if (cft->trigger) {
-		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+		int ret = cft->trigger(css, (unsigned int)cft->private);
 		return ret ? ret : nbytes;
 	}
 	return -EINVAL;
 }
 
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	u64 val = cft->read_u64(cgrp, cft);
+	u64 val = cft->read_u64(css, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	s64 val = cft->read_s64(cgrp, cft);
+	s64 val = cft->read_s64(css, cft);
 	int len = sprintf(tmp, "%lld\n", (long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
+	struct cgroup_subsys_state *css = cfe->css;
 
 	if (cft->read)
-		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->read(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
-		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_s64)
-		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
@@ -2447,11 +2437,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  * supports string->u64 maps, but can be extended in future.
  */
 
-struct cgroup_seqfile_state {
-	struct cftype *cft;
-	struct cgroup *cgroup;
-};
-
 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
 {
 	struct seq_file *sf = cb->state;
@@ -2460,69 +2445,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2445
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2446static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2447{
2463 struct cgroup_seqfile_state *state = m->private; 2448 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2449 struct cftype *cft = cfe->type;
2450 struct cgroup_subsys_state *css = cfe->css;
2451
2465 if (cft->read_map) { 2452 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2453 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2454 .fill = cgroup_map_add,
2468 .state = m, 2455 .state = m,
2469 }; 2456 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2457 return cft->read_map(css, cft, &cb);
2471 } 2458 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2459 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2460}
2481 2461
2482static const struct file_operations cgroup_seqfile_operations = { 2462static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2463 .read = seq_read,
2484 .write = cgroup_file_write, 2464 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2465 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2466 .release = single_release,
2487}; 2467};
2488 2468
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2469static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2470{
2471 struct cfent *cfe = __d_cfe(file->f_dentry);
2472 struct cftype *cft = __d_cft(file->f_dentry);
2473 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2474 struct cgroup_subsys_state *css;
2491 int err; 2475 int err;
2492 struct cftype *cft;
2493 2476
2494 err = generic_file_open(inode, file); 2477 err = generic_file_open(inode, file);
2495 if (err) 2478 if (err)
2496 return err; 2479 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2480
2499 if (cft->read_map || cft->read_seq_string) { 2481 /*
2500 struct cgroup_seqfile_state *state; 2482 * If the file belongs to a subsystem, pin the css. Will be
2483 * unpinned either on open failure or release. This ensures that
2484 * @css stays alive for all file operations.
2485 */
2486 rcu_read_lock();
2487 css = cgroup_css(cgrp, cft->ss);
2488 if (cft->ss && !css_tryget(css))
2489 css = NULL;
2490 rcu_read_unlock();
2501 2491
2502 state = kzalloc(sizeof(*state), GFP_USER); 2492 if (!css)
2503 if (!state) 2493 return -ENODEV;
2504 return -ENOMEM;
2505 2494
2506 state->cft = cft; 2495 /*
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2496 * @cfe->css is used by read/write/close to determine the
2497 * associated css. @file->private_data would be a better place but
2498 * that's already used by seqfile. Multiple accessors may use it
2499 * simultaneously which is okay as the association never changes.
2500 */
2501 WARN_ON_ONCE(cfe->css && cfe->css != css);
2502 cfe->css = css;
2503
2504 if (cft->read_map || cft->read_seq_string) {
2508 file->f_op = &cgroup_seqfile_operations; 2505 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2506 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2507 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2508 err = cft->open(inode, file);
2514 else 2509 }
2515 err = 0;
2516 2510
2511 if (css->ss && err)
2512 css_put(css);
2517 return err; 2513 return err;
2518} 2514}
2519 2515
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2516static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2517{
2518 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2519 struct cftype *cft = __d_cft(file->f_dentry);
2520 struct cgroup_subsys_state *css = cfe->css;
2521 int ret = 0;
2522
2523 if (cft->release) 2523 if (cft->release)
2524 return cft->release(inode, file); 2524 ret = cft->release(inode, file);
2525 return 0; 2525 if (css->ss)
2526 css_put(css);
2527 return ret;
2526} 2528}
2527 2529
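
The new open path above drops the per-open kzalloc'd seqfile state in favour of pinning the css. A condensed kernel-side sketch of the pairing this hunk introduces (not standalone code): open pins the css with css_tryget() under RCU so the later file operations can use cfe->css safely, and release (or a failed open) drops the pin.

    /* in ->open(): pin the css so later file operations can use it */
    rcu_read_lock();
    css = cgroup_css(cgrp, cft->ss);
    if (cft->ss && !css_tryget(css))
            css = NULL;                     /* subsystem css already dying */
    rcu_read_unlock();

    if (!css)
            return -ENODEV;
    cfe->css = css;

    /* ... read/write/seq_file handlers dereference cfe->css ... */

    /* in ->release(), and on open failure: drop the pin */
    if (css->ss)
            css_put(css);
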
2528/* 2530/*
@@ -2736,8 +2738,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2738 return mode;
2737} 2739}
2738 2740
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2741static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2742{
2742 struct dentry *dir = cgrp->dentry; 2743 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2744 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2748,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2748 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2749 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2750
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2751 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2752 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2753 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2754 strcat(name, ".");
2753 } 2755 }
2754 strcat(name, cft->name); 2756 strcat(name, cft->name);
@@ -2782,11 +2784,25 @@ out:
2782 return error; 2784 return error;
2783} 2785}
2784 2786
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2787/**
2786 struct cftype cfts[], bool is_add) 2788 * cgroup_addrm_files - add or remove files to a cgroup directory
2789 * @cgrp: the target cgroup
2790 * @cfts: array of cftypes to be added
2791 * @is_add: whether to add or remove
2792 *
2793 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2794 * For removals, this function never fails. If addition fails, this
2795 * function doesn't remove files already added. The caller is responsible
2796 * for cleaning up.
2797 */
2798static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2799 bool is_add)
2787{ 2800{
2788 struct cftype *cft; 2801 struct cftype *cft;
2789 int err, ret = 0; 2802 int ret;
2803
2804 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2805 lockdep_assert_held(&cgroup_mutex);
2790 2806
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2807 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2808 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2814,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2814 continue;
2799 2815
2800 if (is_add) { 2816 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2817 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2818 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2819 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2820 cft->name, ret);
2805 ret = err; 2821 return ret;
2822 }
2806 } else { 2823 } else {
2807 cgroup_rm_file(cgrp, cft); 2824 cgroup_rm_file(cgrp, cft);
2808 } 2825 }
2809 } 2826 }
2810 return ret; 2827 return 0;
2811} 2828}
2812 2829
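
Given the lockdep assertions and the "caller is responsible for cleaning up" contract documented above, a caller would look roughly like the sketch below. It is hypothetical (my_cfts is an invented cftype array) and assumes that removing a file which was never created is a harmless no-op, which is how the removal path behaves.

    mutex_lock(&cgrp->dentry->d_inode->i_mutex);
    mutex_lock(&cgroup_mutex);

    ret = cgroup_addrm_files(cgrp, my_cfts, true);      /* add */
    if (ret)
            cgroup_addrm_files(cgrp, my_cfts, false);   /* removal never fails */

    mutex_unlock(&cgroup_mutex);
    mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
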
2813static void cgroup_cfts_prepare(void) 2830static void cgroup_cfts_prepare(void)
@@ -2816,28 +2833,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2833 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2834 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2835 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2836 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2837 * lock before calling cgroup_addrm_files().
2821 */ 2838 */
2822 mutex_lock(&cgroup_mutex); 2839 mutex_lock(&cgroup_mutex);
2823} 2840}
2824 2841
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2842static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2843 __releases(&cgroup_mutex)
2828{ 2844{
2829 LIST_HEAD(pending); 2845 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2846 struct cgroup_subsys *ss = cfts[0].ss;
2847 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2848 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2849 struct dentry *prev = NULL;
2833 struct inode *inode; 2850 struct inode *inode;
2851 struct cgroup_subsys_state *css;
2834 u64 update_before; 2852 u64 update_before;
2853 int ret = 0;
2835 2854
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2855 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2856 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2857 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2858 mutex_unlock(&cgroup_mutex);
2840 return; 2859 return 0;
2841 } 2860 }
2842 2861
2843 /* 2862 /*
@@ -2849,17 +2868,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2868
2850 mutex_unlock(&cgroup_mutex); 2869 mutex_unlock(&cgroup_mutex);
2851 2870
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2871 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2872 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2873 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2874 struct cgroup *cgrp = css->cgroup;
2875
2863 if (cgroup_is_dead(cgrp)) 2876 if (cgroup_is_dead(cgrp))
2864 continue; 2877 continue;
2865 2878
@@ -2873,15 +2886,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2886 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2887 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2888 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2889 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2890 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2891 mutex_unlock(&inode->i_mutex);
2879 2892
2880 rcu_read_lock(); 2893 rcu_read_lock();
2894 if (ret)
2895 break;
2881 } 2896 }
2882 rcu_read_unlock(); 2897 rcu_read_unlock();
2883 dput(prev); 2898 dput(prev);
2884 deactivate_super(sb); 2899 deactivate_super(sb);
2900 return ret;
2885} 2901}
2886 2902
2887/** 2903/**
@@ -2901,49 +2917,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2917int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2918{
2903 struct cftype_set *set; 2919 struct cftype_set *set;
2920 struct cftype *cft;
2921 int ret;
2904 2922
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2923 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2924 if (!set)
2907 return -ENOMEM; 2925 return -ENOMEM;
2908 2926
2927 for (cft = cfts; cft->name[0] != '\0'; cft++)
2928 cft->ss = ss;
2929
2909 cgroup_cfts_prepare(); 2930 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2931 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2932 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2933 ret = cgroup_cfts_commit(cfts, true);
2913 2934 if (ret)
2914 return 0; 2935 cgroup_rm_cftypes(cfts);
2936 return ret;
2915} 2937}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2938EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2939
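
For reference, a subsystem registers such an array once and the core propagates the files to all present and future cgroups, rolling the registration back if file creation fails anywhere. A minimal hypothetical example (foo_subsys, foo_files and the handler are invented names; note that after this series the handlers take a cgroup_subsys_state rather than a cgroup):

    static u64 foo_weight_read(struct cgroup_subsys_state *css,
                               struct cftype *cft)
    {
            return 100;     /* real code would read per-css state here */
    }

    static struct cftype foo_files[] = {
            {
                    .name = "weight",
                    .read_u64 = foo_weight_read,
            },
            { }     /* zero-length name terminates the array */
    };

    /* typically from the subsystem's init path */
    WARN_ON(cgroup_add_cftypes(&foo_subsys, foo_files));
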
2918/** 2940/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2941 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2942 * @cfts: zero-length name terminated array of cftypes
2922 * 2943 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2944 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2945 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2946 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2947 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2948 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2949 * registered.
2930 */ 2950 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2951int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2952{
2933 struct cftype_set *set; 2953 struct cftype_set *set;
2934 2954
2955 if (!cfts || !cfts[0].ss)
2956 return -ENOENT;
2957
2935 cgroup_cfts_prepare(); 2958 cgroup_cfts_prepare();
2936 2959
2937 list_for_each_entry(set, &ss->cftsets, node) { 2960 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2961 if (set->cfts == cfts) {
2939 list_del(&set->node); 2962 list_del(&set->node);
2940 kfree(set); 2963 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2964 cgroup_cfts_commit(cfts, false);
2942 return 0; 2965 return 0;
2943 } 2966 }
2944 } 2967 }
2945 2968
2946 cgroup_cfts_commit(ss, NULL, false); 2969 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2970 return -ENOENT;
2948} 2971}
2949 2972
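
Unregistration is now driven purely by the array, since each cftype carries its subsystem in cft->ss. Continuing the hypothetical foo_files example from above:

    ret = cgroup_rm_cftypes(foo_files);     /* -ENOENT if it was never registered */
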
@@ -2966,34 +2989,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2989}
2967 2990
2968/* 2991/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2992 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2993 * their cgroups capability, we don't maintain the lists running through
2971 */ 2994 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2995 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2996 */
2998static void cgroup_enable_task_cg_lists(void) 2997static void cgroup_enable_task_cg_lists(void)
2999{ 2998{
@@ -3024,16 +3023,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3023}
3025 3024
3026/** 3025/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3026 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3027 * @pos_css: the current position (%NULL to initiate traversal)
3028 * @parent_css: css whose children to walk
3029 * 3029 *
3030 * This function returns the next sibling of @pos and should be called 3030 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3031 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3032 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3033 * regardless of their states.
3034 */ 3034 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3035struct cgroup_subsys_state *
3036css_next_child(struct cgroup_subsys_state *pos_css,
3037 struct cgroup_subsys_state *parent_css)
3036{ 3038{
3039 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3040 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3041 struct cgroup *next;
3038 3042
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3043 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3052,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3052 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3053 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3054 * to be visible as %true here.
3055 *
3056 * If @pos is dead, its next pointer can't be dereferenced;
3057 * however, as each cgroup is given a monotonically increasing
3058 * unique serial number and always appended to the sibling list,
3059 * the next one can be found by walking the parent's children until
3060 * we see a cgroup with higher serial number than @pos's. While
3061 * this path can be slower, it's taken only when either the current
3062 * cgroup is removed or iteration and removal race.
3051 */ 3063 */
3052 if (likely(!cgroup_is_dead(pos))) { 3064 if (!pos) {
3065 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3066 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3067 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3068 } else {
3055 return next; 3069 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3070 if (next->serial_nr > pos->serial_nr)
3071 break;
3057 } 3072 }
3058 3073
3059 /* 3074 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3075 return NULL;
3061 * monotonically increasing unique serial number and always 3076
3062 * appended to the sibling list, so the next one can be found by 3077 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3078}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3079EXPORT_SYMBOL_GPL(css_next_child);
3075 3080
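
The serial-number fallback above is what makes it safe to keep iterating even if the css the iterator last returned has since been removed. A typical caller simply loops under RCU; the css_for_each_child() helper expands to roughly the following (visit_child() is an invented callback, parent_css is assumed to be accessible):

    struct cgroup_subsys_state *child = NULL;

    rcu_read_lock();
    while ((child = css_next_child(child, parent_css)))
            visit_child(child);     /* must not sleep inside the RCU section */
    rcu_read_unlock();
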
3076/** 3081/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3082 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3083 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3084 * @root: css whose descendants to walk
3080 * 3085 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3086 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3087 * to visit for pre-order traversal of @root's descendants. @root is
3088 * included in the iteration and the first node to be visited.
3083 * 3089 *
3084 * While this function requires RCU read locking, it doesn't require the 3090 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3091 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3092 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3093 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3094 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3095struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3096css_next_descendant_pre(struct cgroup_subsys_state *pos,
3097 struct cgroup_subsys_state *root)
3091{ 3098{
3092 struct cgroup *next; 3099 struct cgroup_subsys_state *next;
3093 3100
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3101 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3102
3096 /* if first iteration, pretend we just visited @cgroup */ 3103 /* if first iteration, visit @root */
3097 if (!pos) 3104 if (!pos)
3098 pos = cgroup; 3105 return root;
3099 3106
3100 /* visit the first child if exists */ 3107 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3108 next = css_next_child(NULL, pos);
3102 if (next) 3109 if (next)
3103 return next; 3110 return next;
3104 3111
3105 /* no child, visit my or the closest ancestor's next sibling */ 3112 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3113 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3114 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3115 if (next)
3109 return next; 3116 return next;
3110 pos = pos->parent; 3117 pos = css_parent(pos);
3111 } 3118 }
3112 3119
3113 return NULL; 3120 return NULL;
3114} 3121}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3122EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3116 3123
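
Using the css_for_each_descendant_pre() wrapper mentioned in the earlier comment, a pre-order walk over a subsystem's hierarchy looks roughly like this sketch; unlike the old cgroup iterator, @root itself is now visited first:

    struct cgroup_subsys_state *pos;

    rcu_read_lock();
    css_for_each_descendant_pre(pos, root_css) {
            /* root_css is seen on the first iteration, then its
             * descendants in pre-order */
    }
    rcu_read_unlock();
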
3117/** 3124/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3125 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3126 * @pos: css of interest
3120 * 3127 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3128 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3129 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3130 * subtree of @pos.
3124 * 3131 *
3125 * While this function requires RCU read locking, it doesn't require the 3132 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3134,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3134 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3135 * accessible.
3129 */ 3136 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3137struct cgroup_subsys_state *
3138css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3139{
3132 struct cgroup *last, *tmp; 3140 struct cgroup_subsys_state *last, *tmp;
3133 3141
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3142 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3143
@@ -3137,82 +3145,136 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3145 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3146 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3147 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3148 css_for_each_child(tmp, last)
3141 pos = tmp; 3149 pos = tmp;
3142 } while (pos); 3150 } while (pos);
3143 3151
3144 return last; 3152 return last;
3145} 3153}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3154EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3147 3155
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3156static struct cgroup_subsys_state *
3157css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3158{
3150 struct cgroup *last; 3159 struct cgroup_subsys_state *last;
3151 3160
3152 do { 3161 do {
3153 last = pos; 3162 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3163 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3164 } while (pos);
3157 3165
3158 return last; 3166 return last;
3159} 3167}
3160 3168
3161/** 3169/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3170 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3171 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3172 * @root: css whose descendants to walk
3165 * 3173 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3174 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3175 * to visit for post-order traversal of @root's descendants. @root is
3176 * included in the iteration and the last node to be visited.
3168 * 3177 *
3169 * While this function requires RCU read locking, it doesn't require the 3178 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3179 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3180 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3181 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3182 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3183struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3184css_next_descendant_post(struct cgroup_subsys_state *pos,
3185 struct cgroup_subsys_state *root)
3176{ 3186{
3177 struct cgroup *next; 3187 struct cgroup_subsys_state *next;
3178 3188
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3189 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3190
3181 /* if first iteration, visit the leftmost descendant */ 3191 /* if first iteration, visit leftmost descendant which may be @root */
3182 if (!pos) { 3192 if (!pos)
3183 next = cgroup_leftmost_descendant(cgroup); 3193 return css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3194
3185 } 3195 /* if we visited @root, we're done */
3196 if (pos == root)
3197 return NULL;
3186 3198
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3199 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3200 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3201 if (next)
3190 return cgroup_leftmost_descendant(next); 3202 return css_leftmost_descendant(next);
3191 3203
3192 /* no sibling left, visit parent */ 3204 /* no sibling left, visit parent */
3193 next = pos->parent; 3205 return css_parent(pos);
3194 return next != cgroup ? next : NULL; 3206}
3207EXPORT_SYMBOL_GPL(css_next_descendant_post);
3208
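
The post-order variant mirrors this through css_for_each_descendant_post(): children are visited before their parent, and @root now comes last instead of being skipped. Sketch:

    struct cgroup_subsys_state *pos;

    rcu_read_lock();
    css_for_each_descendant_post(pos, root_css) {
            /* every descendant is visited before its parent;
             * root_css is the final iteration */
    }
    rcu_read_unlock();
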
3209/**
3210 * css_advance_task_iter - advance a task iterator to the next css_set
3211 * @it: the iterator to advance
3212 *
3213 * Advance @it to the next css_set to walk.
3214 */
3215static void css_advance_task_iter(struct css_task_iter *it)
3216{
3217 struct list_head *l = it->cset_link;
3218 struct cgrp_cset_link *link;
3219 struct css_set *cset;
3220
3221 /* Advance to the next non-empty css_set */
3222 do {
3223 l = l->next;
3224 if (l == &it->origin_css->cgroup->cset_links) {
3225 it->cset_link = NULL;
3226 return;
3227 }
3228 link = list_entry(l, struct cgrp_cset_link, cset_link);
3229 cset = link->cset;
3230 } while (list_empty(&cset->tasks));
3231 it->cset_link = l;
3232 it->task = cset->tasks.next;
3195} 3233}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3197 3234
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3235/**
3236 * css_task_iter_start - initiate task iteration
3237 * @css: the css to walk tasks of
3238 * @it: the task iterator to use
3239 *
3240 * Initiate iteration through the tasks of @css. The caller can call
3241 * css_task_iter_next() to walk through the tasks until the function
3242 * returns NULL. On completion of iteration, css_task_iter_end() must be
3243 * called.
3244 *
3245 * Note that this function acquires a lock which is released when the
3246 * iteration finishes. The caller can't sleep while iteration is in
3247 * progress.
3248 */
3249void css_task_iter_start(struct cgroup_subsys_state *css,
3250 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3251 __acquires(css_set_lock)
3200{ 3252{
3201 /* 3253 /*
3202 * The first time anyone tries to iterate across a cgroup, 3254 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3255 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3256 * all existing tasks.
3205 */ 3257 */
3206 if (!use_task_css_set_links) 3258 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3259 cgroup_enable_task_cg_lists();
3208 3260
3209 read_lock(&css_set_lock); 3261 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3262
3211 cgroup_advance_iter(cgrp, it); 3263 it->origin_css = css;
3264 it->cset_link = &css->cgroup->cset_links;
3265
3266 css_advance_task_iter(it);
3212} 3267}
3213 3268
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3269/**
3215 struct cgroup_iter *it) 3270 * css_task_iter_next - return the next task for the iterator
3271 * @it: the task iterator being iterated
3272 *
3273 * The "next" function for task iteration. @it should have been
3274 * initialized via css_task_iter_start(). Returns NULL when the iteration
3275 * reaches the end.
3276 */
3277struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3278{
3217 struct task_struct *res; 3279 struct task_struct *res;
3218 struct list_head *l = it->task; 3280 struct list_head *l = it->task;
@@ -3226,16 +3288,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3288 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3289 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3290 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3291 /*
3230 * the next cg_cgroup_link */ 3292 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3293 * next cgrp_cset_link.
3294 */
3295 css_advance_task_iter(it);
3232 } else { 3296 } else {
3233 it->task = l; 3297 it->task = l;
3234 } 3298 }
3235 return res; 3299 return res;
3236} 3300}
3237 3301
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3302/**
3303 * css_task_iter_end - finish task iteration
3304 * @it: the task iterator to finish
3305 *
3306 * Finish task iteration started by css_task_iter_start().
3307 */
3308void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3309 __releases(css_set_lock)
3240{ 3310{
3241 read_unlock(&css_set_lock); 3311 read_unlock(&css_set_lock);
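
Putting the three iterator calls above together, a caller walks the tasks of a css like the sketch below; css_set_lock is read-held between start and end, so the loop body must not sleep:

    struct css_task_iter it;
    struct task_struct *task;

    css_task_iter_start(css, &it);
    while ((task = css_task_iter_next(&it))) {
            /* inspect task; no sleeping or blocking locks here */
    }
    css_task_iter_end(&it);
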
@@ -3276,46 +3346,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3346}
3277 3347
3278/** 3348/**
3279 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 3349 * css_scan_tasks - iterate through all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3350 * @css: the css to iterate tasks of
3351 * @test: optional test callback
3352 * @process: process callback
3353 * @data: data passed to @test and @process
3354 * @heap: optional pre-allocated heap used for task iteration
3355 *
3356 * Iterate through all the tasks in @css, calling @test for each, and if it
3357 * returns %true, call @process for it also.
3281 * 3358 *
3282 * Arguments include pointers to callback functions test_task() and 3359 * @test may be NULL, meaning always true (select all tasks), which
3283 * process_task(). 3360 * effectively duplicates css_task_iter_{start,next,end}() but does not
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3361 * lock css_set_lock for the call to @process.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3362 *
3297 * Note that test_task() may be called with locks held, and may in some 3363 * It is guaranteed that @process will act on every task that is a member
3298 * situations be called multiple times for the same task, so it should 3364 * of @css for the duration of this call. This function may or may not
3299 * be cheap. 3365 * call @process for tasks that exit or move to a different css during the
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3366 * call, or are forked or move into the css during the call.
3301 * pre-allocated and will be used for heap operations (and its "gt" member will 3367 *
3302 * be overwritten), else a temporary heap will be used (allocation of which 3368 * Note that @test may be called with locks held, and may in some
3303 * may cause this function to fail). 3369 * situations be called multiple times for the same task, so it should be
3370 * cheap.
3371 *
3372 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3373 * heap operations (and its "gt" member will be overwritten), else a
3374 * temporary heap will be used (allocation of which may cause this function
3375 * to fail).
3304 */ 3376 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3377int css_scan_tasks(struct cgroup_subsys_state *css,
3378 bool (*test)(struct task_struct *, void *),
3379 void (*process)(struct task_struct *, void *),
3380 void *data, struct ptr_heap *heap)
3306{ 3381{
3307 int retval, i; 3382 int retval, i;
3308 struct cgroup_iter it; 3383 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3384 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3385 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3386 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3387 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3388 struct timespec latest_time = { 0, 0 };
3315 3389
3316 if (scan->heap) { 3390 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3391 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3392 heap->gt = &started_after;
3320 } else { 3393 } else {
3321 /* We need to allocate our own heap memory */ 3394 /* We need to allocate our own heap memory */
@@ -3328,25 +3401,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3401
3329 again: 3402 again:
3330 /* 3403 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3404 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3405 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3406 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3407 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3408 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3409 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3410 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3411 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3412 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3413 */
3342 heap->size = 0; 3414 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3415 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3416 while ((p = css_task_iter_next(&it))) {
3345 /* 3417 /*
3346 * Only affect tasks that qualify per the caller's callback, 3418 * Only affect tasks that qualify per the caller's callback,
3347 * if he provided one 3419 * if he provided one
3348 */ 3420 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3421 if (test && !test(p, data))
3350 continue; 3422 continue;
3351 /* 3423 /*
3352 * Only process tasks that started after the last task 3424 * Only process tasks that started after the last task
@@ -3374,7 +3446,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3446 * the heap and wasn't inserted
3375 */ 3447 */
3376 } 3448 }
3377 cgroup_iter_end(scan->cg, &it); 3449 css_task_iter_end(&it);
3378 3450
3379 if (heap->size) { 3451 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3452 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3456,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3456 latest_task = q;
3385 } 3457 }
3386 /* Process the task per the caller's callback */ 3458 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3459 process(q, data);
3388 put_task_struct(q); 3460 put_task_struct(q);
3389 } 3461 }
3390 /* 3462 /*
@@ -3401,10 +3473,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3473 return 0;
3402} 3474}
3403 3475
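
A caller that needs to act on a filtered subset of tasks, and possibly sleep while doing so, supplies the two callbacks instead of open-coding the iterator. Hypothetical sketch (foo_test, foo_process and foo_data are invented; passing a NULL heap lets the function allocate a temporary one, which may fail):

    static bool foo_test(struct task_struct *task, void *data)
    {
            return task->mm != NULL;        /* e.g. skip kernel threads */
    }

    static void foo_process(struct task_struct *task, void *data)
    {
            /* called without css_set_lock held, so sleeping is allowed */
    }

    /* caller, with @css and @foo_data in hand: */
    ret = css_scan_tasks(css, foo_test, foo_process, foo_data, NULL);

cgroup_transfer_tasks() just below is the in-tree example of the simplest case, with a NULL @test.
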
3404static void cgroup_transfer_one_task(struct task_struct *task, 3476static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3477{
3407 struct cgroup *new_cgroup = scan->data; 3478 struct cgroup *new_cgroup = data;
3408 3479
3409 mutex_lock(&cgroup_mutex); 3480 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3481 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3489,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3489 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3490int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3491{
3421 struct cgroup_scanner scan; 3492 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3493 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3494}
3431 3495
3432/* 3496/*
@@ -3468,7 +3532,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3532 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3533 struct cgroup *owner;
3470 /* protects the other fields */ 3534 /* protects the other fields */
3471 struct rw_semaphore mutex; 3535 struct rw_semaphore rwsem;
3472}; 3536};
3473 3537
3474/* 3538/*
@@ -3541,7 +3605,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3605 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3606
3543 /* 3607 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3608 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3609 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3610 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3611 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3614,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3614 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3615 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3616 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3617 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3618 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3619 return l;
3556 } 3620 }
@@ -3561,8 +3625,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3625 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3626 return l;
3563 } 3627 }
3564 init_rwsem(&l->mutex); 3628 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3629 down_write(&l->rwsem);
3566 l->key.type = type; 3630 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3631 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3632 l->owner = cgrp;
@@ -3580,7 +3644,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3644 pid_t *array;
3581 int length; 3645 int length;
3582 int pid, n = 0; /* used for populating the array */ 3646 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3647 struct css_task_iter it;
3584 struct task_struct *tsk; 3648 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3649 struct cgroup_pidlist *l;
3586 3650
@@ -3595,8 +3659,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3659 if (!array)
3596 return -ENOMEM; 3660 return -ENOMEM;
3597 /* now, populate the array */ 3661 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3662 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3663 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3664 if (unlikely(n == length))
3601 break; 3665 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3666 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3671,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3671 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3672 array[n++] = pid;
3609 } 3673 }
3610 cgroup_iter_end(cgrp, &it); 3674 css_task_iter_end(&it);
3611 length = n; 3675 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3676 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3677 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3687,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3687 l->list = array;
3624 l->length = length; 3688 l->length = length;
3625 l->use_count++; 3689 l->use_count++;
3626 up_write(&l->mutex); 3690 up_write(&l->rwsem);
3627 *lp = l; 3691 *lp = l;
3628 return 0; 3692 return 0;
3629} 3693}
@@ -3641,7 +3705,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3705{
3642 int ret = -EINVAL; 3706 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3707 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3708 struct css_task_iter it;
3645 struct task_struct *tsk; 3709 struct task_struct *tsk;
3646 3710
3647 /* 3711 /*
@@ -3655,8 +3719,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3719 ret = 0;
3656 cgrp = dentry->d_fsdata; 3720 cgrp = dentry->d_fsdata;
3657 3721
3658 cgroup_iter_start(cgrp, &it); 3722 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3723 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3724 switch (tsk->state) {
3661 case TASK_RUNNING: 3725 case TASK_RUNNING:
3662 stats->nr_running++; 3726 stats->nr_running++;
@@ -3676,7 +3740,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3740 break;
3677 } 3741 }
3678 } 3742 }
3679 cgroup_iter_end(cgrp, &it); 3743 css_task_iter_end(&it);
3680 3744
3681err: 3745err:
3682 return ret; 3746 return ret;
@@ -3701,7 +3765,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3765 int index = 0, pid = *pos;
3702 int *iter; 3766 int *iter;
3703 3767
3704 down_read(&l->mutex); 3768 down_read(&l->rwsem);
3705 if (pid) { 3769 if (pid) {
3706 int end = l->length; 3770 int end = l->length;
3707 3771
@@ -3728,7 +3792,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3792static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3793{
3730 struct cgroup_pidlist *l = s->private; 3794 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3795 up_read(&l->rwsem);
3732} 3796}
3733 3797
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3798static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3838,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3838 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3839 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3840 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3841 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3842 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3843 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3844 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3846,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3846 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3847 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3848 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3849 up_write(&l->rwsem);
3786 kfree(l); 3850 kfree(l);
3787 return; 3851 return;
3788 } 3852 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3853 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3854 up_write(&l->rwsem);
3791} 3855}
3792 3856
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3857static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3915,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3915 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3916}
3853 3917
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3918static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3919 struct cftype *cft)
3856{ 3920{
3857 return notify_on_release(cgrp); 3921 return notify_on_release(css->cgroup);
3858} 3922}
3859 3923
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3924static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3925 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3926{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3927 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3928 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3929 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3930 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3931 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3932 return 0;
3870} 3933}
3871 3934
@@ -3895,18 +3958,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3958{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3959 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3960 remove);
3898 struct cgroup *cgrp = event->cgrp; 3961 struct cgroup_subsys_state *css = event->css;
3899 3962
3900 remove_wait_queue(event->wqh, &event->wait); 3963 remove_wait_queue(event->wqh, &event->wait);
3901 3964
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3965 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3966
3904 /* Notify userspace the event is going away. */ 3967 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3968 eventfd_signal(event->eventfd, 1);
3906 3969
3907 eventfd_ctx_put(event->eventfd); 3970 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3971 kfree(event);
3909 cgroup_dput(cgrp); 3972 css_put(css);
3910} 3973}
3911 3974
3912/* 3975/*
@@ -3919,7 +3982,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3982{
3920 struct cgroup_event *event = container_of(wait, 3983 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3984 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3985 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3986 unsigned long flags = (unsigned long)key;
3924 3987
3925 if (flags & POLLHUP) { 3988 if (flags & POLLHUP) {
@@ -3963,14 +4026,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4026 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4027 * Interpretation of args is defined by control file implementation.
3965 */ 4028 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4029static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4030 struct cftype *cft, const char *buffer)
3968{ 4031{
3969 struct cgroup_event *event = NULL; 4032 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4033 struct cgroup_event *event;
4034 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4035 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4036 struct fd efile;
3973 struct file *cfile = NULL; 4037 struct fd cfile;
3974 char *endp; 4038 char *endp;
3975 int ret; 4039 int ret;
3976 4040
@@ -3987,109 +4051,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4051 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4052 if (!event)
3989 return -ENOMEM; 4053 return -ENOMEM;
3990 event->cgrp = cgrp; 4054
3991 INIT_LIST_HEAD(&event->list); 4055 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4056 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4057 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3994 INIT_WORK(&event->remove, cgroup_event_remove); 4058 INIT_WORK(&event->remove, cgroup_event_remove);
3995 4059
3996 efile = eventfd_fget(efd); 4060 efile = fdget(efd);
3997 if (IS_ERR(efile)) { 4061 if (!efile.file) {
3998 ret = PTR_ERR(efile); 4062 ret = -EBADF;
3999 goto fail; 4063 goto out_kfree;
4000 } 4064 }
4001 4065
4002 event->eventfd = eventfd_ctx_fileget(efile); 4066 event->eventfd = eventfd_ctx_fileget(efile.file);
4003 if (IS_ERR(event->eventfd)) { 4067 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4068 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4069 goto out_put_efile;
4006 } 4070 }
4007 4071
4008 cfile = fget(cfd); 4072 cfile = fdget(cfd);
4009 if (!cfile) { 4073 if (!cfile.file) {
4010 ret = -EBADF; 4074 ret = -EBADF;
4011 goto fail; 4075 goto out_put_eventfd;
4012 } 4076 }
4013 4077
4014 /* the process need read permission on control file */ 4078 /* the process need read permission on control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4079 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4080 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4017 if (ret < 0) 4081 if (ret < 0)
4018 goto fail; 4082 goto out_put_cfile;
4019 4083
4020 event->cft = __file_cft(cfile); 4084 event->cft = __file_cft(cfile.file);
4021 if (IS_ERR(event->cft)) { 4085 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4086 ret = PTR_ERR(event->cft);
4023 goto fail; 4087 goto out_put_cfile;
4088 }
4089
4090 if (!event->cft->ss) {
4091 ret = -EBADF;
4092 goto out_put_cfile;
4024 } 4093 }
4025 4094
4026 /* 4095 /*
4027 * The file to be monitored must be in the same cgroup as 4096 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4097 * cgroup as cgroup.event_control, and associate @event with it.
4098 * Remaining events are automatically removed on cgroup destruction
4099 * but the removal is asynchronous, so take an extra ref.
4029 */ 4100 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4101 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4102
4032 ret = -EINVAL; 4103 ret = -EINVAL;
4033 goto fail; 4104 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4105 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4106 if (event->css && event->css == cfile_css && css_tryget(event->css))
4107 ret = 0;
4108
4109 rcu_read_unlock();
4110 if (ret)
4111 goto out_put_cfile;
4035 4112
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4113 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4114 ret = -EINVAL;
4038 goto fail; 4115 goto out_put_css;
4039 } 4116 }
4040 4117
4041 ret = event->cft->register_event(cgrp, event->cft, 4118 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4119 event->eventfd, buffer);
4043 if (ret) 4120 if (ret)
4044 goto fail; 4121 goto out_put_css;
4045 4122
4046 efile->f_op->poll(efile, &event->pt); 4123 efile.file->f_op->poll(efile.file, &event->pt);
4047
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054 4124
4055 spin_lock(&cgrp->event_list_lock); 4125 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4126 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4127 spin_unlock(&cgrp->event_list_lock);
4058 4128
4059 fput(cfile); 4129 fdput(cfile);
4060 fput(efile); 4130 fdput(efile);
4061 4131
4062 return 0; 4132 return 0;
4063 4133
4064fail: 4134out_put_css:
4065 if (cfile) 4135 css_put(event->css);
4066 fput(cfile); 4136out_put_cfile:
4067 4137 fdput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4138out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4139 eventfd_ctx_put(event->eventfd);
4070 4140out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4141 fdput(efile);
4072 fput(efile); 4142out_kfree:
4073
4074 kfree(event); 4143 kfree(event);
4075 4144
4076 return ret; 4145 return ret;
4077} 4146}
4078 4147
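
From userspace, the '<event_fd> <control_fd> <args>' format translates into something like the sketch below, shown against the memory controller's usage threshold file; the mount point, group name and 64M threshold are illustrative assumptions.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
            int efd = eventfd(0, 0);
            int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
                           O_RDONLY);
            int ctl = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
                           O_WRONLY);
            char buf[64];
            uint64_t count;

            if (efd < 0 || cfd < 0 || ctl < 0)
                    return 1;

            /* "<event_fd> <control_fd> <args>": arm a 64M usage threshold */
            snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
                     (unsigned long long)(64 << 20));
            if (write(ctl, buf, strlen(buf)) != (ssize_t)strlen(buf))
                    return 1;

            read(efd, &count, sizeof(count));       /* blocks until it fires */
            printf("threshold crossed (%llu event(s))\n",
                   (unsigned long long)count);
            return 0;
    }
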
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4148static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4149 struct cftype *cft)
4081{ 4150{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4151 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4152}
4084 4153
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4154static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4155 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4156{
4089 if (val) 4157 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4159 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4160 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4161 return 0;
4094} 4162}
4095 4163
@@ -4148,36 +4216,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4216};
4149 4217
4150/** 4218/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4219 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4220 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4221 * @subsys_mask: mask of the subsystem ids whose files should be added
4222 *
4223 * On failure, no file is added.
4155 */ 4224 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4225static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4226{
4159 int err;
4160 struct cgroup_subsys *ss; 4227 struct cgroup_subsys *ss;
4161 4228 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4229
4168 /* process cftsets of each subsystem */ 4230 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4231 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4232 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4233
4234 if (!test_bit(i, &subsys_mask))
4172 continue; 4235 continue;
4173 4236
4174 list_for_each_entry(set, &ss->cftsets, node) 4237 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4238 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4239 if (ret < 0)
4240 goto err;
4241 }
4176 } 4242 }
4177 4243
4178 /* This cgroup is ready now */ 4244 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4245 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4246 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4247 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4248
4183 /* 4249 /*
@@ -4190,14 +4256,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4256 }
4191 4257
4192 return 0; 4258 return 0;
4259err:
4260 cgroup_clear_dir(cgrp, subsys_mask);
4261 return ret;
4193} 4262}
4194 4263
4195static void css_dput_fn(struct work_struct *work) 4264/*
4265 * css destruction is four-stage process.
4266 *
4267 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4268 * Implemented in kill_css().
4269 *
4270 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4271 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4272 * by invoking offline_css(). After offlining, the base ref is put.
4273 * Implemented in css_killed_work_fn().
4274 *
4275 * 3. When the percpu_ref reaches zero, the only possible remaining
4276 * accessors are inside RCU read sections. css_release() schedules the
4277 * RCU callback.
4278 *
4279 * 4. After the grace period, the css can be freed. Implemented in
4280 * css_free_work_fn().
4281 *
4282 * It is actually hairier because steps 2 and 4 require process context
4283 * and thus involve punting to css->destroy_work, adding two additional
4284 * steps to the already complex sequence.
4285 */
4286static void css_free_work_fn(struct work_struct *work)
4196{ 4287{
4197 struct cgroup_subsys_state *css = 4288 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4289 container_of(work, struct cgroup_subsys_state, destroy_work);
4290 struct cgroup *cgrp = css->cgroup;
4291
4292 if (css->parent)
4293 css_put(css->parent);
4199 4294
4200 cgroup_dput(css->cgroup); 4295 css->ss->css_free(css);
4296 cgroup_dput(cgrp);
4297}
4298
4299static void css_free_rcu_fn(struct rcu_head *rcu_head)
4300{
4301 struct cgroup_subsys_state *css =
4302 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4303
4304 /*
4305 * css holds an extra ref to @cgrp->dentry which is put on the last
4306 * css_put(). dput() requires process context which we don't have.
4307 */
4308 INIT_WORK(&css->destroy_work, css_free_work_fn);
4309 schedule_work(&css->destroy_work);
4201} 4310}
4202 4311
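The comment block above explains why freeing is punted twice: css_release() can run in a context where dput() is not allowed, so the final free bounces through an RCU callback and then a work item. As a rough userspace analogy only (pthreads and a plain list standing in for the RCU callback and workqueue; obj, obj_put and free_worker are invented names), the "last put only queues, a worker frees" pattern looks like:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

struct obj {
    atomic_int refcnt;
    struct obj *next;          /* link on the deferred-free list */
    char name[16];
};

static struct obj *free_list;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t free_cond = PTHREAD_COND_INITIALIZER;

/* Analogue of css_release()/css_free_rcu_fn(): the final put may run in
 * a context where freeing is not allowed, so it only queues the object. */
static void obj_put(struct obj *o)
{
    if (atomic_fetch_sub(&o->refcnt, 1) != 1)
        return;
    pthread_mutex_lock(&free_lock);
    o->next = free_list;
    free_list = o;
    pthread_cond_signal(&free_cond);
    pthread_mutex_unlock(&free_lock);
}

/* Analogue of the workqueue running css_free_work_fn(). */
static void *free_worker(void *arg)
{
    (void)arg;
    for (;;) {
        pthread_mutex_lock(&free_lock);
        while (!free_list)
            pthread_cond_wait(&free_cond, &free_lock);
        struct obj *o = free_list;
        free_list = o->next;
        pthread_mutex_unlock(&free_lock);
        printf("freeing %s from worker context\n", o->name);
        free(o);
    }
    return NULL;
}

int main(void)
{
    pthread_t worker;
    struct obj *o = calloc(1, sizeof(*o));

    if (!o)
        return 1;
    atomic_store(&o->refcnt, 1);
    strcpy(o->name, "demo");
    pthread_create(&worker, NULL, free_worker, NULL);
    obj_put(o);                /* last put: freeing is deferred to the worker */
    sleep(1);
    return 0;
}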
4203static void css_release(struct percpu_ref *ref) 4312static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4314,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4314 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4315 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4316
4208 schedule_work(&css->dput_work); 4317 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4318}
4210 4319
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4320static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4321 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4322{
4215 css->cgroup = cgrp; 4323 css->cgroup = cgrp;
4324 css->ss = ss;
4216 css->flags = 0; 4325 css->flags = 0;
4217 css->id = NULL; 4326 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4327
4328 if (cgrp->parent)
4329 css->parent = cgroup_css(cgrp->parent, ss);
4330 else
4219 css->flags |= CSS_ROOT; 4331 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4332
4223 /* 4333 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4334}
4231 4335
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4336/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4337static int online_css(struct cgroup_subsys_state *css)
4234{ 4338{
4339 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4340 int ret = 0;
4236 4341
4237 lockdep_assert_held(&cgroup_mutex); 4342 lockdep_assert_held(&cgroup_mutex);
4238 4343
4239 if (ss->css_online) 4344 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4345 ret = ss->css_online(css);
4241 if (!ret) 4346 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4347 css->flags |= CSS_ONLINE;
4348 css->cgroup->nr_css++;
4349 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4350 }
4243 return ret; 4351 return ret;
4244} 4352}
4245 4353
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4354/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4355static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4356{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4357 struct cgroup_subsys *ss = css->ss;
4251 4358
4252 lockdep_assert_held(&cgroup_mutex); 4359 lockdep_assert_held(&cgroup_mutex);
4253 4360
@@ -4255,9 +4362,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4362 return;
4256 4363
4257 if (ss->css_offline) 4364 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4365 ss->css_offline(css);
4259 4366
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4367 css->flags &= ~CSS_ONLINE;
4368 css->cgroup->nr_css--;
4369 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4370}
4262 4371
4263/* 4372/*
@@ -4271,6 +4380,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4380static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4381 umode_t mode)
4273{ 4382{
4383 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4384 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4385 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4386 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4398,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4398 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4399 rcu_assign_pointer(cgrp->name, name);
4290 4400
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4401 /*
4402 * Temporarily set the pointer to NULL, so idr_find() won't return
4403 * a half-baked cgroup.
4404 */
4405 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4406 if (cgrp->id < 0)
4293 goto err_free_name; 4407 goto err_free_name;
4294 4408
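The idr_alloc(..., NULL, ...) call above reserves an ID while keeping the slot empty, and idr_replace() later in this patch publishes the fully built cgroup, so idr_find() never returns a half-baked object. A toy, assumption-laden model of that reserve-then-publish idiom (fixed-size table, C11 atomics, invented id_* helpers rather than the idr API):

#include <stdatomic.h>
#include <stddef.h>

#define MAX_IDS 64

static _Atomic(void *) table[MAX_IDS];
static atomic_bool used[MAX_IDS];

static int id_reserve(void)
{
    for (int id = 1; id < MAX_IDS; id++)
        if (!atomic_exchange(&used[id], 1)) {
            atomic_store(&table[id], NULL);   /* reserved but "half-baked" */
            return id;
        }
    return -1;                                /* no free ID */
}

static void id_publish(int id, void *obj)
{
    atomic_store(&table[id], obj);            /* lookups may now return it */
}

static void *id_lookup(int id)
{
    return (id > 0 && id < MAX_IDS) ? atomic_load(&table[id]) : NULL;
}

static void id_remove(int id)
{
    atomic_store(&table[id], NULL);
    atomic_store(&used[id], 0);
}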
@@ -4317,6 +4431,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4431 cgrp->dentry = dentry;
4318 4432
4319 cgrp->parent = parent; 4433 cgrp->parent = parent;
4434 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4435 cgrp->root = parent->root;
4321 4436
4322 if (notify_on_release(parent)) 4437 if (notify_on_release(parent))
@@ -4328,22 +4443,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4443 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4444 struct cgroup_subsys_state *css;
4330 4445
4331 css = ss->css_alloc(cgrp); 4446 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4447 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4448 err = PTR_ERR(css);
4334 goto err_free_all; 4449 goto err_free_all;
4335 } 4450 }
4451 css_ar[ss->subsys_id] = css;
4336 4452
4337 err = percpu_ref_init(&css->refcnt, css_release); 4453 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4454 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4455 goto err_free_all;
4341 }
4342 4456
4343 init_cgroup_css(css, ss, cgrp); 4457 init_css(css, ss, cgrp);
4344 4458
4345 if (ss->use_id) { 4459 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4460 err = alloc_css_id(css);
4347 if (err) 4461 if (err)
4348 goto err_free_all; 4462 goto err_free_all;
4349 } 4463 }
@@ -4365,16 +4479,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4479 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4480 root->number_of_cgroups++;
4367 4481
4368 /* each css holds a ref to the cgroup's dentry */ 4482 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4483 for_each_root_subsys(root, ss) {
4484 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4485
4370 dget(dentry); 4486 dget(dentry);
4487 css_get(css->parent);
4488 }
4371 4489
4372 /* hold a ref to the parent's dentry */ 4490 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4491 dget(parent->dentry);
4374 4492
4375 /* creation succeeded, notify subsystems */ 4493 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4494 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4495 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4496
4497 err = online_css(css);
4378 if (err) 4498 if (err)
4379 goto err_destroy; 4499 goto err_destroy;
4380 4500
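The loop above now takes both a dentry reference and a css_get() on the parent css for every new css, and css_free_work_fn() earlier in this patch drops the parent reference only when the child itself is freed. A tiny sketch of that parent-pinning rule (hypothetical node type and userspace atomics, not the css API):

#include <stdatomic.h>
#include <stdlib.h>

struct node {
    atomic_int refcnt;         /* starts at 1 for the creator */
    struct node *parent;       /* pinned: the child holds one ref on it */
};

static void node_get(struct node *n)
{
    atomic_fetch_add(&n->refcnt, 1);
}

static void node_put(struct node *n)
{
    /* Dropping the last ref on a child also drops the ref it held on
     * its parent, so frees always cascade bottom-up. */
    while (n && atomic_fetch_sub(&n->refcnt, 1) == 1) {
        struct node *parent = n->parent;

        free(n);
        n = parent;
    }
}

static struct node *node_create(struct node *parent)
{
    struct node *n = calloc(1, sizeof(*n));

    if (!n)
        return NULL;
    atomic_store(&n->refcnt, 1);
    n->parent = parent;
    if (parent)
        node_get(parent);      /* the child pins its parent */
    return n;
}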
@@ -4388,7 +4508,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4508 }
4389 } 4509 }
4390 4510
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4511 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4512
4513 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4514 if (err)
4515 goto err_destroy;
4516
4517 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4518 if (err)
4393 goto err_destroy; 4519 goto err_destroy;
4394 4520
@@ -4399,18 +4525,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4525
4400err_free_all: 4526err_free_all:
4401 for_each_root_subsys(root, ss) { 4527 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4528 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4529
4404 if (css) { 4530 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4531 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4532 ss->css_free(css);
4407 } 4533 }
4408 } 4534 }
4409 mutex_unlock(&cgroup_mutex); 4535 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4536 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4537 deactivate_super(sb);
4412err_free_id: 4538err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4539 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4540err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4541 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4542err_free_cgrp:
@@ -4432,22 +4558,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4558 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4559}
4434 4560
4435static void cgroup_css_killed(struct cgroup *cgrp) 4561/*
4562 * This is called when the refcnt of a css is confirmed to be killed.
4563 * css_tryget() is now guaranteed to fail.
4564 */
4565static void css_killed_work_fn(struct work_struct *work)
4436{ 4566{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4567 struct cgroup_subsys_state *css =
4438 return; 4568 container_of(work, struct cgroup_subsys_state, destroy_work);
4569 struct cgroup *cgrp = css->cgroup;
4439 4570
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4571 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4572
4442 schedule_work(&cgrp->destroy_work); 4573 /*
4574 * css_tryget() is guaranteed to fail now. Tell subsystems to
4575	 * initiate destruction.
4576 */
4577 offline_css(css);
4578
4579 /*
4580 * If @cgrp is marked dead, it's waiting for refs of all css's to
4581 * be disabled before proceeding to the second phase of cgroup
4582 * destruction. If we are the last one, kick it off.
4583 */
4584 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4585 cgroup_destroy_css_killed(cgrp);
4586
4587 mutex_unlock(&cgroup_mutex);
4588
4589 /*
4590 * Put the css refs from kill_css(). Each css holds an extra
4591 * reference to the cgroup's dentry and cgroup removal proceeds
4592 * regardless of css refs. On the last put of each css, whenever
4593 * that may be, the extra dentry ref is put so that dentry
4594 * destruction happens only after all css's are released.
4595 */
4596 css_put(css);
4443} 4597}
4444 4598
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4599/* css kill confirmation processing requires process context, bounce */
4600static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4601{
4447 struct cgroup_subsys_state *css = 4602 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4603 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4604
4450 cgroup_css_killed(css->cgroup); 4605 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4606 schedule_work(&css->destroy_work);
4607}
4608
4609/**
4610 * kill_css - destroy a css
4611 * @css: css to destroy
4612 *
4613 * This function initiates destruction of @css by removing cgroup interface
4614 * files and putting its base reference. ->css_offline() will be invoked
4615 * asynchronously once css_tryget() is guaranteed to fail and when the
4616 * reference count reaches zero, @css will be released.
4617 */
4618static void kill_css(struct cgroup_subsys_state *css)
4619{
4620 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4621
4622 /*
4623 * Killing would put the base ref, but we need to keep it alive
4624 * until after ->css_offline().
4625 */
4626 css_get(css);
4627
4628 /*
4629 * cgroup core guarantees that, by the time ->css_offline() is
4630 * invoked, no new css reference will be given out via
4631 * css_tryget(). We can't simply call percpu_ref_kill() and
4632 * proceed to offlining css's because percpu_ref_kill() doesn't
4633 * guarantee that the ref is seen as killed on all CPUs on return.
4634 *
4635 * Use percpu_ref_kill_and_confirm() to get notifications as each
4636 * css is confirmed to be seen as killed on all CPUs.
4637 */
4638 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4639}
4452 4640
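kill_css() needs percpu_ref_kill_and_confirm() because a per-CPU counter cannot make the kill instantly visible everywhere; the confirm callback may fire only once css_tryget() is guaranteed to fail. The toy refcount below shows the intended semantics with a single shared flag (invented ref_* names, userspace C11 atomics); the comment notes what the real per-CPU case additionally requires:

#include <stdatomic.h>
#include <stdbool.h>

struct ref {
    atomic_long count;                  /* starts at 1: the "base" reference */
    atomic_bool dead;
    void (*release)(struct ref *ref);   /* runs when the count hits zero */
};

static void ref_put(struct ref *r)
{
    if (atomic_fetch_sub(&r->count, 1) == 1)
        r->release(r);
}

static bool ref_tryget(struct ref *r)
{
    atomic_fetch_add(&r->count, 1);
    if (atomic_load(&r->dead)) {        /* already killed: back out again */
        ref_put(r);
        return false;
    }
    return true;
}

static void ref_kill_and_confirm(struct ref *r, void (*confirm)(struct ref *))
{
    atomic_store(&r->dead, true);
    /* One shared flag makes the kill visible immediately, so the confirm
     * step can run right away. The kernel's counter is per-CPU, which is
     * why it needs percpu_ref_kill_and_confirm() and a grace period
     * before css_killed_ref_fn() may assume css_tryget() fails. */
    confirm(r);
    ref_put(r);                         /* drop the base reference */
}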
4453/** 4641/**
@@ -4513,41 +4701,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4513 return -EBUSY; 4701 return -EBUSY;
4514 4702
4515 /* 4703 /*
4516 * Block new css_tryget() by killing css refcnts. cgroup core 4704 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4517 * guarantees that, by the time ->css_offline() is invoked, no new 4705 * will be invoked to perform the rest of destruction once the
4518 * css reference will be given out via css_tryget(). We can't 4706 * percpu refs of all css's are confirmed to be killed.
4519 * simply call percpu_ref_kill() and proceed to offlining css's
4520 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4521 * as killed on all CPUs on return.
4522 *
4523 * Use percpu_ref_kill_and_confirm() to get notifications as each
4524 * css is confirmed to be seen as killed on all CPUs. The
4525 * notification callback keeps track of the number of css's to be
4526 * killed and schedules cgroup_offline_fn() to perform the rest of
4527 * destruction once the percpu refs of all css's are confirmed to
4528 * be killed.
4529 */ 4707 */
4530 atomic_set(&cgrp->css_kill_cnt, 1); 4708 for_each_root_subsys(cgrp->root, ss)
4531 for_each_root_subsys(cgrp->root, ss) { 4709 kill_css(cgroup_css(cgrp, ss));
4532 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4533
4534 /*
4535 * Killing would put the base ref, but we need to keep it
4536 * alive until after ->css_offline.
4537 */
4538 percpu_ref_get(&css->refcnt);
4539
4540 atomic_inc(&cgrp->css_kill_cnt);
4541 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4542 }
4543 cgroup_css_killed(cgrp);
4544 4710
4545 /* 4711 /*
4546 * Mark @cgrp dead. This prevents further task migration and child 4712 * Mark @cgrp dead. This prevents further task migration and child
4547 * creation by disabling cgroup_lock_live_group(). Note that 4713 * creation by disabling cgroup_lock_live_group(). Note that
4548 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4714 * CGRP_DEAD assertion is depended upon by css_next_child() to
4549 * resume iteration after dropping RCU read lock. See 4715 * resume iteration after dropping RCU read lock. See
4550 * cgroup_next_sibling() for details. 4716 * css_next_child() for details.
4551 */ 4717 */
4552 set_bit(CGRP_DEAD, &cgrp->flags); 4718 set_bit(CGRP_DEAD, &cgrp->flags);
4553 4719
@@ -4558,9 +4724,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4558 raw_spin_unlock(&release_list_lock); 4724 raw_spin_unlock(&release_list_lock);
4559 4725
4560 /* 4726 /*
4561 * Remove @cgrp directory. The removal puts the base ref but we 4727 * If @cgrp has css's attached, the second stage of cgroup
4562 * aren't quite done with @cgrp yet, so hold onto it. 4728 * destruction is kicked off from css_killed_work_fn() after the
4729 * refs of all attached css's are killed. If @cgrp doesn't have
4730 * any css, we kick it off here.
4563 */ 4731 */
4732 if (!cgrp->nr_css)
4733 cgroup_destroy_css_killed(cgrp);
4734
4735 /*
4736 * Clear the base files and remove @cgrp directory. The removal
4737 * puts the base ref but we aren't quite done with @cgrp yet, so
4738 * hold onto it.
4739 */
4740 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4564 dget(d); 4741 dget(d);
4565 cgroup_d_remove_dir(d); 4742 cgroup_d_remove_dir(d);
4566 4743
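The hunk above gates the second stage of destruction on cgrp->nr_css: either the last css to go offline kicks it off from css_killed_work_fn(), or cgroup_destroy_locked() runs it immediately when no css is attached. Under cgroup_mutex this "last one out runs the next phase" logic stays simple; a hedged userspace sketch (pthread mutex standing in for cgroup_mutex, made-up group type):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct group {
    int nr_children;           /* like cgrp->nr_css: still-online children */
    bool dead;                 /* like CGRP_DEAD */
};

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static void second_stage(struct group *g)
{
    (void)g;
    printf("all children gone and group is dead: finish teardown\n");
}

/* Called once per child as its teardown completes (cf. css_killed_work_fn). */
static void child_finished(struct group *g)
{
    pthread_mutex_lock(&group_lock);
    if (--g->nr_children == 0 && g->dead)
        second_stage(g);
    pthread_mutex_unlock(&group_lock);
}

/* Called once to start destroying @g (cf. cgroup_destroy_locked). */
static void group_kill(struct group *g)
{
    pthread_mutex_lock(&group_lock);
    g->dead = true;
    if (g->nr_children == 0)
        second_stage(g);        /* nothing outstanding: run it now */
    pthread_mutex_unlock(&group_lock);
}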
@@ -4580,50 +4757,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4580}; 4757};
4581 4758
4582/** 4759/**
4583 * cgroup_offline_fn - the second step of cgroup destruction 4760 * cgroup_destroy_css_killed - the second step of cgroup destruction
4584 * @work: cgroup->destroy_free_work 4761 * @cgrp: the cgroup being destroyed
4585 * 4762 *
4586 * This function is invoked from a work item for a cgroup which is being 4763 * This function is invoked from a work item for a cgroup which is being
4587 * destroyed after the percpu refcnts of all css's are guaranteed to be 4764 * destroyed after all css's are offlined and performs the rest of
4588 * seen as killed on all CPUs, and performs the rest of destruction. This 4765 * destruction. This is the second step of destruction described in the
4589 * is the second step of destruction described in the comment above 4766 * comment above cgroup_destroy_locked().
4590 * cgroup_destroy_locked().
4591 */ 4767 */
4592static void cgroup_offline_fn(struct work_struct *work) 4768static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4593{ 4769{
4594 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4595 struct cgroup *parent = cgrp->parent; 4770 struct cgroup *parent = cgrp->parent;
4596 struct dentry *d = cgrp->dentry; 4771 struct dentry *d = cgrp->dentry;
4597 struct cgroup_subsys *ss;
4598 4772
4599 mutex_lock(&cgroup_mutex); 4773 lockdep_assert_held(&cgroup_mutex);
4600 4774
4601 /* 4775 /* delete this cgroup from parent->children */
4602 * css_tryget() is guaranteed to fail now. Tell subsystems to 4776 list_del_rcu(&cgrp->sibling);
4603 * initate destruction.
4604 */
4605 for_each_root_subsys(cgrp->root, ss)
4606 offline_css(ss, cgrp);
4607 4777
4608 /* 4778 /*
4609 * Put the css refs from cgroup_destroy_locked(). Each css holds 4779 * We should remove the cgroup object from idr before its grace
4610 * an extra reference to the cgroup's dentry and cgroup removal 4780 * period starts, so we won't be looking up a cgroup while the
4611 * proceeds regardless of css refs. On the last put of each css, 4781 * cgroup is being freed.
4612 * whenever that may be, the extra dentry ref is put so that dentry
4613 * destruction happens only after all css's are released.
4614 */ 4782 */
4615 for_each_root_subsys(cgrp->root, ss) 4783 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4616 css_put(cgrp->subsys[ss->subsys_id]); 4784 cgrp->id = -1;
4617
4618 /* delete this cgroup from parent->children */
4619 list_del_rcu(&cgrp->sibling);
4620 4785
4621 dput(d); 4786 dput(d);
4622 4787
4623 set_bit(CGRP_RELEASABLE, &parent->flags); 4788 set_bit(CGRP_RELEASABLE, &parent->flags);
4624 check_for_release(parent); 4789 check_for_release(parent);
4625
4626 mutex_unlock(&cgroup_mutex);
4627} 4790}
4628 4791
4629static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4792static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4646,6 +4809,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4646 * deregistration. 4809 * deregistration.
4647 */ 4810 */
4648 if (ss->base_cftypes) { 4811 if (ss->base_cftypes) {
4812 struct cftype *cft;
4813
4814 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4815 cft->ss = ss;
4816
4649 ss->base_cftset.cfts = ss->base_cftypes; 4817 ss->base_cftset.cfts = ss->base_cftypes;
4650 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4818 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4651 } 4819 }
@@ -4665,10 +4833,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4665 /* Create the top cgroup state for this subsystem */ 4833 /* Create the top cgroup state for this subsystem */
4666 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4834 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4667 ss->root = &cgroup_dummy_root; 4835 ss->root = &cgroup_dummy_root;
4668 css = ss->css_alloc(cgroup_dummy_top); 4836 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4669 /* We don't handle early failures gracefully */ 4837 /* We don't handle early failures gracefully */
4670 BUG_ON(IS_ERR(css)); 4838 BUG_ON(IS_ERR(css));
4671 init_cgroup_css(css, ss, cgroup_dummy_top); 4839 init_css(css, ss, cgroup_dummy_top);
4672 4840
4673 /* Update the init_css_set to contain a subsys 4841 /* Update the init_css_set to contain a subsys
4674 * pointer to this state - since the subsystem is 4842 * pointer to this state - since the subsystem is
@@ -4683,7 +4851,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4683 * need to invoke fork callbacks here. */ 4851 * need to invoke fork callbacks here. */
4684 BUG_ON(!list_empty(&init_task.tasks)); 4852 BUG_ON(!list_empty(&init_task.tasks));
4685 4853
4686 BUG_ON(online_css(ss, cgroup_dummy_top)); 4854 BUG_ON(online_css(css));
4687 4855
4688 mutex_unlock(&cgroup_mutex); 4856 mutex_unlock(&cgroup_mutex);
4689 4857
@@ -4744,7 +4912,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4744 * struct, so this can happen first (i.e. before the dummy root 4912 * struct, so this can happen first (i.e. before the dummy root
4745 * attachment). 4913 * attachment).
4746 */ 4914 */
4747 css = ss->css_alloc(cgroup_dummy_top); 4915 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4748 if (IS_ERR(css)) { 4916 if (IS_ERR(css)) {
4749 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4917 /* failure case - need to deassign the cgroup_subsys[] slot. */
4750 cgroup_subsys[ss->subsys_id] = NULL; 4918 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4756,8 +4924,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4756 ss->root = &cgroup_dummy_root; 4924 ss->root = &cgroup_dummy_root;
4757 4925
4758 /* our new subsystem will be attached to the dummy hierarchy. */ 4926 /* our new subsystem will be attached to the dummy hierarchy. */
4759 init_cgroup_css(css, ss, cgroup_dummy_top); 4927 init_css(css, ss, cgroup_dummy_top);
4760 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4928 /* init_idr must be after init_css() because it sets css->id. */
4761 if (ss->use_id) { 4929 if (ss->use_id) {
4762 ret = cgroup_init_idr(ss, css); 4930 ret = cgroup_init_idr(ss, css);
4763 if (ret) 4931 if (ret)
@@ -4787,7 +4955,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4787 } 4955 }
4788 write_unlock(&css_set_lock); 4956 write_unlock(&css_set_lock);
4789 4957
4790 ret = online_css(ss, cgroup_dummy_top); 4958 ret = online_css(css);
4791 if (ret) 4959 if (ret)
4792 goto err_unload; 4960 goto err_unload;
4793 4961
@@ -4819,14 +4987,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4819 4987
4820 /* 4988 /*
4821 * we shouldn't be called if the subsystem is in use, and the use of 4989 * we shouldn't be called if the subsystem is in use, and the use of
4822 * try_module_get in parse_cgroupfs_options should ensure that it 4990 * try_module_get() in rebind_subsystems() should ensure that it
4823 * doesn't start being used while we're killing it off. 4991 * doesn't start being used while we're killing it off.
4824 */ 4992 */
4825 BUG_ON(ss->root != &cgroup_dummy_root); 4993 BUG_ON(ss->root != &cgroup_dummy_root);
4826 4994
4827 mutex_lock(&cgroup_mutex); 4995 mutex_lock(&cgroup_mutex);
4828 4996
4829 offline_css(ss, cgroup_dummy_top); 4997 offline_css(cgroup_css(cgroup_dummy_top, ss));
4830 4998
4831 if (ss->use_id) 4999 if (ss->use_id)
4832 idr_destroy(&ss->idr); 5000 idr_destroy(&ss->idr);
@@ -4860,8 +5028,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4860 * the cgrp->subsys pointer to find their state. note that this 5028 * the cgrp->subsys pointer to find their state. note that this
4861 * also takes care of freeing the css_id. 5029 * also takes care of freeing the css_id.
4862 */ 5030 */
4863 ss->css_free(cgroup_dummy_top); 5031 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4864 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5032 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4865 5033
4866 mutex_unlock(&cgroup_mutex); 5034 mutex_unlock(&cgroup_mutex);
4867} 5035}
@@ -4943,6 +5111,10 @@ int __init cgroup_init(void)
4943 5111
4944 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5112 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4945 5113
5114 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5115 0, 1, GFP_KERNEL);
5116 BUG_ON(err < 0);
5117
4946 mutex_unlock(&cgroup_root_mutex); 5118 mutex_unlock(&cgroup_root_mutex);
4947 mutex_unlock(&cgroup_mutex); 5119 mutex_unlock(&cgroup_mutex);
4948 5120
@@ -5099,7 +5271,7 @@ void cgroup_fork(struct task_struct *child)
5099 * Adds the task to the list running through its css_set if necessary and 5271 * Adds the task to the list running through its css_set if necessary and
5100 * call the subsystem fork() callbacks. Has to be after the task is 5272 * call the subsystem fork() callbacks. Has to be after the task is
5101 * visible on the task list in case we race with the first call to 5273 * visible on the task list in case we race with the first call to
5102 * cgroup_iter_start() - to guarantee that the new task ends up on its 5274 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5103 * list. 5275 * list.
5104 */ 5276 */
5105void cgroup_post_fork(struct task_struct *child) 5277void cgroup_post_fork(struct task_struct *child)
@@ -5212,10 +5384,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5212 */ 5384 */
5213 for_each_builtin_subsys(ss, i) { 5385 for_each_builtin_subsys(ss, i) {
5214 if (ss->exit) { 5386 if (ss->exit) {
5215 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5387 struct cgroup_subsys_state *old_css = cset->subsys[i];
5216 struct cgroup *cgrp = task_cgroup(tsk, i); 5388 struct cgroup_subsys_state *css = task_css(tsk, i);
5217 5389
5218 ss->exit(cgrp, old_cgrp, tsk); 5390 ss->exit(css, old_css, tsk);
5219 } 5391 }
5220 } 5392 }
5221 } 5393 }
@@ -5474,20 +5646,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5474 return 0; 5646 return 0;
5475} 5647}
5476 5648
5477static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5649static int alloc_css_id(struct cgroup_subsys_state *child_css)
5478 struct cgroup *child)
5479{ 5650{
5480 int subsys_id, i, depth = 0; 5651 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5481 struct cgroup_subsys_state *parent_css, *child_css;
5482 struct css_id *child_id, *parent_id; 5652 struct css_id *child_id, *parent_id;
5653 int i, depth;
5483 5654
5484 subsys_id = ss->subsys_id;
5485 parent_css = parent->subsys[subsys_id];
5486 child_css = child->subsys[subsys_id];
5487 parent_id = rcu_dereference_protected(parent_css->id, true); 5655 parent_id = rcu_dereference_protected(parent_css->id, true);
5488 depth = parent_id->depth + 1; 5656 depth = parent_id->depth + 1;
5489 5657
5490 child_id = get_new_cssid(ss, depth); 5658 child_id = get_new_cssid(child_css->ss, depth);
5491 if (IS_ERR(child_id)) 5659 if (IS_ERR(child_id))
5492 return PTR_ERR(child_id); 5660 return PTR_ERR(child_id);
5493 5661
@@ -5525,31 +5693,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5525} 5693}
5526EXPORT_SYMBOL_GPL(css_lookup); 5694EXPORT_SYMBOL_GPL(css_lookup);
5527 5695
5528/* 5696/**
5529 * get corresponding css from file open on cgroupfs directory 5697 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5698 * @dentry: directory dentry of interest
5699 * @ss: subsystem of interest
5700 *
5701 * Must be called under RCU read lock. The caller is responsible for
5702 * pinning the returned css if it needs to be accessed outside the RCU
5703 * critical section.
5530 */ 5704 */
5531struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5705struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5706 struct cgroup_subsys *ss)
5532{ 5707{
5533 struct cgroup *cgrp; 5708 struct cgroup *cgrp;
5534 struct inode *inode;
5535 struct cgroup_subsys_state *css;
5536 5709
5537 inode = file_inode(f); 5710 WARN_ON_ONCE(!rcu_read_lock_held());
5538 /* check in cgroup filesystem dir */ 5711
5539 if (inode->i_op != &cgroup_dir_inode_operations) 5712 /* is @dentry a cgroup dir? */
5713 if (!dentry->d_inode ||
5714 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5540 return ERR_PTR(-EBADF); 5715 return ERR_PTR(-EBADF);
5541 5716
5542 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5717 cgrp = __d_cgrp(dentry);
5543 return ERR_PTR(-EINVAL); 5718 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5719}
5544 5720
5545 /* get cgroup */ 5721/**
5546 cgrp = __d_cgrp(f->f_dentry); 5722 * css_from_id - lookup css by id
5547 css = cgrp->subsys[id]; 5723 * @id: the cgroup id
5548 return css ? css : ERR_PTR(-ENOENT); 5724 * @ss: cgroup subsys to be looked into
5725 *
5726 * Returns the css if there's a valid one with @id, otherwise returns NULL.
5727 * Should be called under rcu_read_lock().
5728 */
5729struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5730{
5731 struct cgroup *cgrp;
5732
5733 rcu_lockdep_assert(rcu_read_lock_held() ||
5734 lockdep_is_held(&cgroup_mutex),
5735 "css_from_id() needs proper protection");
5736
5737 cgrp = idr_find(&ss->root->cgroup_idr, id);
5738 if (cgrp)
5739 return cgroup_css(cgrp, ss);
5740 return NULL;
5549} 5741}
5550 5742
5551#ifdef CONFIG_CGROUP_DEBUG 5743#ifdef CONFIG_CGROUP_DEBUG
5552static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5744static struct cgroup_subsys_state *
5745debug_css_alloc(struct cgroup_subsys_state *parent_css)
5553{ 5746{
5554 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5747 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5555 5748
@@ -5559,22 +5752,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5559 return css; 5752 return css;
5560} 5753}
5561 5754
5562static void debug_css_free(struct cgroup *cgrp) 5755static void debug_css_free(struct cgroup_subsys_state *css)
5563{ 5756{
5564 kfree(cgrp->subsys[debug_subsys_id]); 5757 kfree(css);
5565} 5758}
5566 5759
5567static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5760static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5761 struct cftype *cft)
5568{ 5762{
5569 return cgroup_task_count(cgrp); 5763 return cgroup_task_count(css->cgroup);
5570} 5764}
5571 5765
5572static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5766static u64 current_css_set_read(struct cgroup_subsys_state *css,
5767 struct cftype *cft)
5573{ 5768{
5574 return (u64)(unsigned long)current->cgroups; 5769 return (u64)(unsigned long)current->cgroups;
5575} 5770}
5576 5771
5577static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5772static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5578 struct cftype *cft) 5773 struct cftype *cft)
5579{ 5774{
5580 u64 count; 5775 u64 count;
@@ -5585,7 +5780,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5585 return count; 5780 return count;
5586} 5781}
5587 5782
5588static int current_css_set_cg_links_read(struct cgroup *cgrp, 5783static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5589 struct cftype *cft, 5784 struct cftype *cft,
5590 struct seq_file *seq) 5785 struct seq_file *seq)
5591{ 5786{
@@ -5612,14 +5807,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5612} 5807}
5613 5808
5614#define MAX_TASKS_SHOWN_PER_CSS 25 5809#define MAX_TASKS_SHOWN_PER_CSS 25
5615static int cgroup_css_links_read(struct cgroup *cgrp, 5810static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5616 struct cftype *cft, 5811 struct cftype *cft, struct seq_file *seq)
5617 struct seq_file *seq)
5618{ 5812{
5619 struct cgrp_cset_link *link; 5813 struct cgrp_cset_link *link;
5620 5814
5621 read_lock(&css_set_lock); 5815 read_lock(&css_set_lock);
5622 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5816 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5623 struct css_set *cset = link->cset; 5817 struct css_set *cset = link->cset;
5624 struct task_struct *task; 5818 struct task_struct *task;
5625 int count = 0; 5819 int count = 0;
@@ -5638,9 +5832,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5638 return 0; 5832 return 0;
5639} 5833}
5640 5834
5641static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5835static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5642{ 5836{
5643 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5837 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5644} 5838}
5645 5839
5646static struct cftype debug_files[] = { 5840static struct cftype debug_files[] = {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
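freezer_read() above now walks the descendants in post-order and calls update_if_frozen() on each, so by the time a cgroup is examined all of its children already carry an up-to-date FROZEN bit. A small recursive model of that bottom-up check (invented fnode type, no locking, not the kernel iterator):

#include <stdbool.h>

struct fnode {
    bool freezing;              /* freezing requested here or by an ancestor */
    bool frozen;                /* lazily updated, like CGROUP_FROZEN */
    int nr_unfrozen_tasks;      /* tasks not yet in the frozen state */
    struct fnode *child, *sibling;
};

static void update_if_frozen_model(struct fnode *n)
{
    /* children first: the parent's answer depends on theirs */
    for (struct fnode *c = n->child; c; c = c->sibling)
        update_if_frozen_model(c);

    if (!n->freezing || n->frozen)
        return;
    for (struct fnode *c = n->child; c; c = c->sibling)
        if (!c->frozen)
            return;             /* some child hasn't finished freezing */
    if (n->nr_unfrozen_tasks == 0)
        n->frozen = true;
}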
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
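freezer_change_state() now applies FREEZING_SELF to the written cgroup and FREEZING_PARENT to every descendant in a single pre-order walk, so each child reads an already-updated parent state. The sketch below models that traversal on a toy child/sibling tree (no RCU or locking; the iterative walk is just one possible shape):

#include <stdbool.h>
#include <stddef.h>

#define FREEZING_SELF   0x1
#define FREEZING_PARENT 0x2
#define FREEZING        (FREEZING_SELF | FREEZING_PARENT)

struct node {
    unsigned int state;
    struct node *parent;
    struct node *child;         /* first child */
    struct node *sibling;       /* next sibling */
};

static void apply_state(struct node *n, bool freeze, unsigned int bit)
{
    if (freeze)
        n->state |= bit;
    else
        n->state &= ~bit;
}

static void change_state(struct node *target, bool freeze)
{
    /* walk the subtree rooted at @target in pre-order */
    for (struct node *pos = target; pos; ) {
        if (pos == target)
            apply_state(pos, freeze, FREEZING_SELF);
        else
            apply_state(pos, pos->parent->state & FREEZING,
                        FREEZING_PARENT);
        /* pre-order step: child first, then sibling, then climb back up */
        if (pos->child) {
            pos = pos->child;
        } else {
            while (pos != target && !pos->sibling)
                pos = pos->parent;
            pos = (pos == target) ? NULL : pos->sibling;
        }
    }
}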
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..859c8dfd78a1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,26 +20,46 @@
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22 22
23DEFINE_PER_CPU(struct context_tracking, context_tracking) = { 23#define CREATE_TRACE_POINTS
24#ifdef CONFIG_CONTEXT_TRACKING_FORCE 24#include <trace/events/context_tracking.h>
25 .active = true, 25
26#endif 26struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
27}; 27EXPORT_SYMBOL_GPL(context_tracking_enabled);
28
29DEFINE_PER_CPU(struct context_tracking, context_tracking);
30EXPORT_SYMBOL_GPL(context_tracking);
31
32void context_tracking_cpu_set(int cpu)
33{
34 if (!per_cpu(context_tracking.active, cpu)) {
35 per_cpu(context_tracking.active, cpu) = true;
36 static_key_slow_inc(&context_tracking_enabled);
37 }
38}
28 39
29/** 40/**
30 * user_enter - Inform the context tracking that the CPU is going to 41 * context_tracking_user_enter - Inform the context tracking that the CPU is going to
31 * enter userspace mode. 42 * enter userspace mode.
32 * 43 *
33 * This function must be called right before we switch from the kernel 44 * This function must be called right before we switch from the kernel
34 * to userspace, when it's guaranteed the remaining kernel instructions 45 * to userspace, when it's guaranteed the remaining kernel instructions
35 * to execute won't use any RCU read side critical section because this 46 * to execute won't use any RCU read side critical section because this
36 * function sets RCU in extended quiescent state. 47 * function sets RCU in extended quiescent state.
37 */ 48 */
38void user_enter(void) 49void context_tracking_user_enter(void)
39{ 50{
40 unsigned long flags; 51 unsigned long flags;
41 52
42 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key
57 * check.
58 */
59 if (!static_key_false(&context_tracking_enabled))
60 return;
61
62 /*
 43 * Some contexts may involve an exception occurring in an irq, 63 * Some contexts may involve an exception occurring in an irq,
44 * leading to that nesting: 64 * leading to that nesting:
45 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() 65 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
@@ -54,17 +74,32 @@ void user_enter(void)
54 WARN_ON_ONCE(!current->mm); 74 WARN_ON_ONCE(!current->mm);
55 75
56 local_irq_save(flags); 76 local_irq_save(flags);
 57 if (__this_cpu_read(context_tracking.active) && 77 if (__this_cpu_read(context_tracking.state) != IN_USER) {
58 __this_cpu_read(context_tracking.state) != IN_USER) { 78 if (__this_cpu_read(context_tracking.active)) {
79 trace_user_enter(0);
80 /*
81 * At this stage, only low level arch entry code remains and
82 * then we'll run in userspace. We can assume there won't be
83 * any RCU read-side critical section until the next call to
84 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
85 * on the tick.
86 */
87 vtime_user_enter(current);
88 rcu_user_enter();
89 }
59 /* 90 /*
60 * At this stage, only low level arch entry code remains and 91 * Even if context tracking is disabled on this CPU, because it's outside
61 * then we'll run in userspace. We can assume there won't be 92 * the full dynticks mask for example, we still have to keep track of the
62 * any RCU read-side critical section until the next call to 93 * context transitions and states to prevent inconsistency on those of
63 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency 94 * other CPUs.
 64 * on the tick. 95 * If a task triggers an exception in userspace, sleeps on the exception
 96 * handler and then migrates to another CPU, that new CPU must know where
97 * the exception returns by the time we call exception_exit().
98 * This information can only be provided by the previous CPU when it called
99 * exception_enter().
100 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
101 * is false because we know that CPU is not tickless.
65 */ 102 */
66 vtime_user_enter(current);
67 rcu_user_enter();
68 __this_cpu_write(context_tracking.state, IN_USER); 103 __this_cpu_write(context_tracking.state, IN_USER);
69 } 104 }
70 local_irq_restore(flags); 105 local_irq_restore(flags);
@@ -87,10 +122,9 @@ void user_enter(void)
87 */ 122 */
88void __sched notrace preempt_schedule_context(void) 123void __sched notrace preempt_schedule_context(void)
89{ 124{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
92 126
93 if (likely(ti->preempt_count || irqs_disabled())) 127 if (likely(!preemptible()))
94 return; 128 return;
95 129
96 /* 130 /*
@@ -112,8 +146,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */ 146#endif /* CONFIG_PREEMPT */
113 147
114/** 148/**
115 * user_exit - Inform the context tracking that the CPU is 149 * context_tracking_user_exit - Inform the context tracking that the CPU is
116 * exiting userspace mode and entering the kernel. 150 * exiting userspace mode and entering the kernel.
117 * 151 *
118 * This function must be called after we entered the kernel from userspace 152 * This function must be called after we entered the kernel from userspace
119 * before any use of RCU read side critical section. This potentially include 153 * before any use of RCU read side critical section. This potentially include
@@ -122,47 +156,34 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
122 * This call supports re-entrancy. This way it can be called from any exception 156 * This call supports re-entrancy. This way it can be called from any exception
123 * handler without needing to know if we came from userspace or not. 157 * handler without needing to know if we came from userspace or not.
124 */ 158 */
125void user_exit(void) 159void context_tracking_user_exit(void)
126{ 160{
127 unsigned long flags; 161 unsigned long flags;
128 162
163 if (!static_key_false(&context_tracking_enabled))
164 return;
165
129 if (in_interrupt()) 166 if (in_interrupt())
130 return; 167 return;
131 168
132 local_irq_save(flags); 169 local_irq_save(flags);
133 if (__this_cpu_read(context_tracking.state) == IN_USER) { 170 if (__this_cpu_read(context_tracking.state) == IN_USER) {
134 /* 171 if (__this_cpu_read(context_tracking.active)) {
135 * We are going to run code that may use RCU. Inform 172 /*
136 * RCU core about that (ie: we may need the tick again). 173 * We are going to run code that may use RCU. Inform
137 */ 174 * RCU core about that (ie: we may need the tick again).
138 rcu_user_exit(); 175 */
139 vtime_user_exit(current); 176 rcu_user_exit();
177 vtime_user_exit(current);
178 trace_user_exit(0);
179 }
140 __this_cpu_write(context_tracking.state, IN_KERNEL); 180 __this_cpu_write(context_tracking.state, IN_KERNEL);
141 } 181 }
142 local_irq_restore(flags); 182 local_irq_restore(flags);
143} 183}
144 184
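The reworked user enter/exit paths above always record the IN_USER/IN_KERNEL transition but run the vtime/RCU hooks only when tracking is active on the CPU, and bail out early when the global static key is off. A userspace approximation of that state machine (thread-local state instead of per-CPU data, a plain bool instead of a static key, puts() standing in for the real hooks):

#include <stdbool.h>
#include <stdio.h>

enum ctx_state { IN_KERNEL, IN_USER };

static bool context_tracking_enabled;          /* static key stand-in */
static _Thread_local bool tracking_active;     /* per-CPU .active stand-in */
static _Thread_local enum ctx_state state = IN_KERNEL;

static void user_enter_model(void)
{
    if (!context_tracking_enabled)             /* early out, like the key check */
        return;
    if (state != IN_USER) {
        if (tracking_active)
            puts("vtime/RCU user-enter hooks"); /* vtime_user_enter(), rcu_user_enter() */
        state = IN_USER;       /* always recorded, even if inactive here */
    }
}

static void user_exit_model(void)
{
    if (!context_tracking_enabled)
        return;
    if (state == IN_USER) {
        if (tracking_active)
            puts("vtime/RCU user-exit hooks");  /* rcu_user_exit(), vtime_user_exit() */
        state = IN_KERNEL;
    }
}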
145void guest_enter(void)
146{
147 if (vtime_accounting_enabled())
148 vtime_guest_enter(current);
149 else
150 __guest_enter();
151}
152EXPORT_SYMBOL_GPL(guest_enter);
153
154void guest_exit(void)
155{
156 if (vtime_accounting_enabled())
157 vtime_guest_exit(current);
158 else
159 __guest_exit();
160}
161EXPORT_SYMBOL_GPL(guest_exit);
162
163
164/** 185/**
165 * context_tracking_task_switch - context switch the syscall callbacks 186 * __context_tracking_task_switch - context switch the syscall callbacks
166 * @prev: the task that is being switched out 187 * @prev: the task that is being switched out
167 * @next: the task that is being switched in 188 * @next: the task that is being switched in
168 * 189 *
@@ -174,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
174 * migrate to some CPU that doesn't do the context tracking. As such the TIF 195 * migrate to some CPU that doesn't do the context tracking. As such the TIF
175 * flag may not be desired there. 196 * flag may not be desired there.
176 */ 197 */
177void context_tracking_task_switch(struct task_struct *prev, 198void __context_tracking_task_switch(struct task_struct *prev,
178 struct task_struct *next) 199 struct task_struct *next)
179{ 200{
180 if (__this_cpu_read(context_tracking.active)) { 201 clear_tsk_thread_flag(prev, TIF_NOHZ);
181 clear_tsk_thread_flag(prev, TIF_NOHZ); 202 set_tsk_thread_flag(next, TIF_NOHZ);
182 set_tsk_thread_flag(next, TIF_NOHZ);
183 }
184} 203}
204
205#ifdef CONFIG_CONTEXT_TRACKING_FORCE
206void __init context_tracking_init(void)
207{
208 int cpu;
209
210 for_each_possible_cpu(cpu)
211 context_tracking_cpu_set(cpu);
212}
213#endif
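
Before moving on to kernel/cpu.c: the context_tracking hunks above gate the expensive probes behind a static key plus a per-CPU "active" flag, while the IN_USER/IN_KERNEL flag still flips unconditionally so a later exception_exit() knows where the exception returns. A minimal userspace model of that toggle, with invented names and a single instance standing in for the per-CPU data, might look like this:

#include <stdio.h>

enum ctx_state { IN_KERNEL = 0, IN_USER = 1 };

struct context_tracking {
	int active;              /* does this CPU want the vtime/RCU hooks? */
	enum ctx_state state;    /* where the CPU currently runs */
};

/* one instance stands in for the per-CPU variable */
static struct context_tracking ct = { .active = 1, .state = IN_KERNEL };

static void model_user_enter(void)
{
	if (ct.state != IN_USER) {
		if (ct.active)
			printf("vtime_user_enter + rcu_user_enter\n");
		ct.state = IN_USER;   /* recorded even when inactive */
	}
}

static void model_user_exit(void)
{
	if (ct.state == IN_USER) {
		if (ct.active)
			printf("rcu_user_exit + vtime_user_exit\n");
		ct.state = IN_KERNEL;
	}
}

int main(void)
{
	model_user_exit();            /* re-entrant: no-op while in kernel */
	model_user_enter();           /* return to "userspace" */
	model_user_exit();            /* take an "exception" */
	return 0;
}

Recording the state even on inactive CPUs mirrors the comment added above: the hooks can be skipped, but the bookkeeping cannot.
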
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b2b227b82123..d7f07a2da5a6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
113 * get_online_cpus() not an api which is called all that often. 113 * get_online_cpus() not an api which is called all that often.
114 * 114 *
115 */ 115 */
116static void cpu_hotplug_begin(void) 116void cpu_hotplug_begin(void)
117{ 117{
118 cpu_hotplug.active_writer = current; 118 cpu_hotplug.active_writer = current;
119 119
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
127 } 127 }
128} 128}
129 129
130static void cpu_hotplug_done(void) 130void cpu_hotplug_done(void)
131{ 131{
132 cpu_hotplug.active_writer = NULL; 132 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
154 cpu_maps_update_done(); 154 cpu_maps_update_done();
155} 155}
156 156
157#else /* #if CONFIG_HOTPLUG_CPU */ 157#endif /* CONFIG_HOTPLUG_CPU */
158static void cpu_hotplug_begin(void) {}
159static void cpu_hotplug_done(void) {}
160#endif /* #else #if CONFIG_HOTPLUG_CPU */
161 158
162/* Need to know about CPUs going up/down? */ 159/* Need to know about CPUs going up/down? */
163int __ref register_cpu_notifier(struct notifier_block *nb) 160int __ref register_cpu_notifier(struct notifier_block *nb)
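
The kernel/cpu.c change above simply drops the static qualifiers so cpu_hotplug_begin()/cpu_hotplug_done() can be called from outside kernel/cpu.c. As a rough userspace sketch of the writer-side bracket they implement (plain pthreads, hypothetical names, much simplified relative to the real refcounted scheme):

#include <pthread.h>
#include <stdio.h>

/* stand-in for the cpu_hotplug bookkeeping in kernel/cpu.c */
static struct {
	pthread_mutex_t lock;
	pthread_t active_writer;
	int have_writer;
} hotplug = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void hotplug_begin(void)      /* ~ cpu_hotplug_begin() */
{
	pthread_mutex_lock(&hotplug.lock);
	hotplug.active_writer = pthread_self();
	hotplug.have_writer = 1;
}

static void hotplug_done(void)       /* ~ cpu_hotplug_done() */
{
	hotplug.have_writer = 0;
	pthread_mutex_unlock(&hotplug.lock);
}

int main(void)
{
	hotplug_begin();
	printf("a CPU would be brought up or down here\n");
	hotplug_done();
	return 0;
}
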
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ea1966db34f2..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk ancestor of 218 * @root_cs: target cpuset to walk ancestor of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
515 struct cpuset *root_cs) 504 struct cpuset *root_cs)
516{ 505{
517 struct cpuset *cp; 506 struct cpuset *cp;
518 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
519 508
520 rcu_read_lock(); 509 rcu_read_lock();
521 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
522 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
523 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
524 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
525 continue; 517 continue;
526 } 518 }
527 519
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
596 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
597 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
598 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
599 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
600 592
601 doms = NULL; 593 doms = NULL;
602 dattr = NULL; 594 dattr = NULL;
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
625 csn = 0; 617 csn = 0;
626 618
627 rcu_read_lock(); 619 rcu_read_lock();
628 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
629 /* 623 /*
630 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
631 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
642 csa[csn++] = cp; 636 csa[csn++] = cp;
643 637
644 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
645 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
646 } 640 }
647 rcu_read_unlock(); 641 rcu_read_unlock();
648 642
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
837/** 831/**
838 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
839 * @tsk: task to test 833 * @tsk: task to test
840 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset to @tsk belongs to
841 * 835 *
842 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
843 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
844 * 838 *
845 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
846 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
847 */ 841 */
848static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
849 struct cgroup_scanner *scan)
850{ 843{
851 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
852 846
853 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
854 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
855} 848}
856 849
857/** 850/**
858 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
859 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
860 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
861 * 854 *
862 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
863 * 856 *
864 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
865 * calling callback functions for each. 858 * calling callback functions for each.
866 * 859 *
867 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
868 * if @heap != NULL. 861 * if @heap != NULL.
869 */ 862 */
870static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
871{ 864{
872 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
873
874 scan.cg = cs->css.cgroup;
875 scan.test_task = NULL;
876 scan.process_task = cpuset_change_cpumask;
877 scan.heap = heap;
878 cgroup_scan_tasks(&scan);
879} 866}
880 867
881/* 868/*
882 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
883 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
884 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
885 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
886 * 873 *
887 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
888 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
893 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
894{ 881{
895 struct cpuset *cp; 882 struct cpuset *cp;
896 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
897
898 if (update_root)
899 update_tasks_cpumask(root_cs, heap);
900 884
901 rcu_read_lock(); 885 rcu_read_lock();
902 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
903 /* skip the whole subtree if @cp have some CPU */ 887 if (cp == root_cs) {
904 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
905 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
906 continue; 890 } else {
891 /* skip the whole subtree if @cp have some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
907 } 896 }
908 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
909 continue; 898 continue;
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1059 task_unlock(tsk); 1048 task_unlock(tsk);
1060} 1049}
1061 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1062/* 1056/*
1063 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1064 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1065 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1066 */ 1060 */
1067static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1068 struct cgroup_scanner *scan)
1069{ 1062{
1070 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1071 struct mm_struct *mm; 1065 struct mm_struct *mm;
1072 int migrate; 1066 int migrate;
1073 nodemask_t *newmems = scan->data;
1074 1067
1075 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1076 1069
1077 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1078 if (!mm) 1071 if (!mm)
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1082 1075
1083 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1084 if (migrate) 1077 if (migrate)
1085 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1086 mmput(mm); 1079 mmput(mm);
1087} 1080}
1088 1081
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound;
1091/** 1084/**
1092 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1093 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1094 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1095 * 1088 *
1096 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1097 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1098 * if @heap != NULL.
1099 */ 1091 */
1100static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1101{ 1093{
1102 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1103 struct cgroup_scanner scan;
1104 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1105 1098
1106 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1107 1100
1108 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1109 1102
1110 scan.cg = cs->css.cgroup;
1111 scan.test_task = NULL;
1112 scan.process_task = cpuset_change_nodemask;
1113 scan.heap = heap;
1114 scan.data = &newmems;
1115
1116 /* 1103 /*
1117 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1118 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1123 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1124 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1125 */ 1112 */
1126 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1127 1114
1128 /* 1115 /*
1129 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1139 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1140 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1141 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1142 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1143 * 1130 *
1144 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1145 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1150 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1151{ 1138{
1152 struct cpuset *cp; 1139 struct cpuset *cp;
1153 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1154
1155 if (update_root)
1156 update_tasks_nodemask(root_cs, heap);
1157 1141
1158 rcu_read_lock(); 1142 rcu_read_lock();
1159 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1160 /* skip the whole subtree if @cp have some CPU */ 1144 if (cp == root_cs) {
1161 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1162 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1163 continue; 1147 } else {
1148 /* skip the whole subtree if @cp have some CPU */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1164 } 1153 }
1165 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1166 continue; 1155 continue;
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1267 return 0; 1256 return 0;
1268} 1257}
1269 1258
1270/* 1259/**
1271 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1272 * @tsk: task to be updated 1261 * @tsk: task to be updated
1273 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset to @tsk belongs to
1274 * 1263 *
1275 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1276 * 1265 *
1277 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1278 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1279 */ 1268 */
1280static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1281 struct cgroup_scanner *scan)
1282{ 1270{
1283 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1284} 1274}
1285 1275
1286/* 1276/**
1287 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1288 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1289 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1290 * 1280 *
1291 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1292 * 1282 *
1293 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1294 * calling callback functions for each. 1284 * calling callback functions for each.
1295 * 1285 *
1296 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1297 * if @heap != NULL. 1287 * if @heap != NULL.
1298 */ 1288 */
1299static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1300{ 1290{
1301 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1302
1303 scan.cg = cs->css.cgroup;
1304 scan.test_task = NULL;
1305 scan.process_task = cpuset_change_flag;
1306 scan.heap = heap;
1307 cgroup_scan_tasks(&scan);
1308} 1292}
1309 1293
1310/* 1294/*
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1462} 1446}
1463 1447
1464/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1465static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1466{ 1451{
1467 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1468 struct task_struct *task; 1453 struct task_struct *task;
1469 int ret; 1454 int ret;
1470 1455
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1475 * flag is set. 1460 * flag is set.
1476 */ 1461 */
1477 ret = -ENOSPC; 1462 ret = -ENOSPC;
1478 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock; 1465 goto out_unlock;
1481 1466
1482 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1483 /* 1468 /*
1484 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1485 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1508,11 +1493,11 @@ out_unlock:
1508 return ret; 1493 return ret;
1509} 1494}
1510 1495
1511static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1513{ 1498{
1514 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1515 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1516 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1517} 1502}
1518 1503
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1523 */ 1508 */
1524static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1525 1510
1526static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1527{ 1513{
1528 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1529 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1530 struct mm_struct *mm; 1516 struct mm_struct *mm;
1531 struct task_struct *task; 1517 struct task_struct *task;
1532 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1533 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1534 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1535 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1536 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1537 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1538 1525
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1546 1533
1547 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1548 1535
1549 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1550 /* 1537 /*
1551 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1552 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1608,9 +1595,10 @@ typedef enum {
1608 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t; 1596} cpuset_filetype_t;
1610 1597
1611static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1612{ 1600{
1613 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1614 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1615 int retval = 0; 1603 int retval = 0;
1616 1604
@@ -1657,9 +1645,10 @@ out_unlock:
1657 return retval; 1645 return retval;
1658} 1646}
1659 1647
1660static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1661{ 1650{
1662 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1663 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1664 int retval = -ENODEV; 1653 int retval = -ENODEV;
1665 1654
@@ -1683,10 +1672,10 @@ out_unlock:
1683/* 1672/*
1684 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1685 */ 1674 */
1686static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1687 const char *buf) 1676 struct cftype *cft, const char *buf)
1688{ 1677{
1689 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1690 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1691 int retval = -ENODEV; 1680 int retval = -ENODEV;
1692 1681
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1765 return count; 1754 return count;
1766} 1755}
1767 1756
1768static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1769 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1770 struct file *file, 1759 char __user *buf, size_t nbytes,
1771 char __user *buf, 1760 loff_t *ppos)
1772 size_t nbytes, loff_t *ppos)
1773{ 1761{
1774 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1775 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1776 char *page; 1764 char *page;
1777 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1801,9 +1789,9 @@ out:
1801 return retval; 1789 return retval;
1802} 1790}
1803 1791
1804static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1805{ 1793{
1806 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1807 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1808 switch (type) { 1796 switch (type) {
1809 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1832 return 0; 1820 return 0;
1833} 1821}
1834 1822
1835static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1836{ 1824{
1837 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1838 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1839 switch (type) { 1827 switch (type) {
1840 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1949,11 +1937,12 @@ static struct cftype files[] = {
1949 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1950 */ 1938 */
1951 1939
1952static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1953{ 1942{
1954 struct cpuset *cs; 1943 struct cpuset *cs;
1955 1944
1956 if (!cgrp->parent) 1945 if (!parent_css)
1957 return &top_cpuset.css; 1946 return &top_cpuset.css;
1958 1947
1959 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1973 return &cs->css; 1962 return &cs->css;
1974} 1963}
1975 1964
1976static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{ 1966{
1978 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1981 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1982 1971
1983 if (!parent) 1972 if (!parent)
1984 return 0; 1973 return 0;
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1993 1982
1994 number_of_cpusets++; 1983 number_of_cpusets++;
1995 1984
1996 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1997 goto out_unlock; 1986 goto out_unlock;
1998 1987
1999 /* 1988 /*
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2010 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2011 */ 2000 */
2012 rcu_read_lock(); 2001 rcu_read_lock();
2013 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2014 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2015 rcu_read_unlock(); 2004 rcu_read_unlock();
2016 goto out_unlock; 2005 goto out_unlock;
@@ -2027,9 +2016,15 @@ out_unlock:
2027 return 0; 2016 return 0;
2028} 2017}
2029 2018
2030static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{ 2026{
2032 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2033 2028
2034 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2035 2030
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2042 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2043} 2038}
2044 2039
2045/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2046 * If the cpuset being removed has its flag 'sched_load_balance'
2047 * enabled, then simulate turning sched_load_balance off, which
2048 * will call rebuild_sched_domains_locked().
2049 */
2050
2051static void cpuset_css_free(struct cgroup *cgrp)
2052{ 2041{
2053 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2054 2043
2055 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2056 kfree(cs); 2045 kfree(cs);
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2257 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2258 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2259 struct cpuset *cs; 2248 struct cpuset *cs;
2260 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2261 2250
2262 rcu_read_lock(); 2251 rcu_read_lock();
2263 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2264 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2265 continue; 2254 continue;
2266 rcu_read_unlock(); 2255 rcu_read_unlock();
2267 2256
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2350 2339
2351void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2352{ 2341{
2353 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2354 2343
2355 rcu_read_lock(); 2344 rcu_read_lock();
2356 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2423 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2424 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2425 */ 2414 */
2426static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2427{ 2416{
2428 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2429 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2493 */ 2482 */
2494int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2495{ 2484{
2496 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2497 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2498 2487
2499 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2731 goto out_free; 2720 goto out_free;
2732 2721
2733 rcu_read_lock(); 2722 rcu_read_lock();
2734 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2735 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2736 rcu_read_unlock(); 2725 rcu_read_unlock();
2737 if (retval < 0) 2726 if (retval < 0)
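
The bulk of the cpuset.c conversion above is mechanical: the cgroup callbacks now receive a struct cgroup_subsys_state * and recover the cpuset with css_cs(), a NULL-safe container_of() on the embedded css member. A self-contained sketch of that embed-and-recover idiom, using toy types rather than the kernel's, is:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* stand-ins for cgroup_subsys_state and cpuset */
struct css { int refcnt; };

struct cpuset {
	struct css css;     /* embedded base object */
	int cpus_allowed;   /* toy payload */
};

/* the css_cs() idiom: NULL-safe downcast from base to container */
static struct cpuset *css_cs(struct css *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

int main(void)
{
	struct cpuset cs = { .css = { .refcnt = 1 }, .cpus_allowed = 0xf };
	struct css *base = &cs.css;

	printf("cpus_allowed = %#x\n", css_cs(base)->cpus_allowed);
	printf("css_cs(NULL) = %p\n", (void *)css_cs(NULL));
	return 0;
}

The same idiom replaces cgroup_cs() throughout the file: the subsystem state is the first-class handle, and the containing cpuset is derived from it.
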
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8b..97b67df8fbfe 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118exit: 118exit:
119 if (err)
120 atomic_dec(&nr_callchain_events);
121
119 mutex_unlock(&callchain_mutex); 122 mutex_unlock(&callchain_mutex);
120 123
121 return err; 124 return err;
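
The callchain.c hunk above fixes an accounting leak: if alloc_callchain_buffers() fails, the reference taken on nr_callchain_events is now dropped before returning. The general "take the count up front, undo it on the error path" shape of that fix, in a plain C sketch with made-up names (the real code uses atomics):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int nr_users;          /* counts active users of a shared buffer */
static void *shared_buf;      /* lazily allocated on first use */

static int get_buffers(void)
{
	int err = 0;

	nr_users++;                       /* take the reference up front */
	if (nr_users > 1)                 /* someone else already allocated */
		goto out;

	shared_buf = malloc(4096);
	if (!shared_buf)
		err = -ENOMEM;
out:
	if (err)
		nr_users--;               /* the fix: drop the ref on failure */
	return err;
}

int main(void)
{
	int err = get_buffers();

	printf("get_buffers() = %d, nr_users = %d\n", err, nr_users);
	return 0;
}
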
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..953c14348375 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
145static atomic_t nr_mmap_events __read_mostly; 145static atomic_t nr_mmap_events __read_mostly;
146static atomic_t nr_comm_events __read_mostly; 146static atomic_t nr_comm_events __read_mostly;
147static atomic_t nr_task_events __read_mostly; 147static atomic_t nr_task_events __read_mostly;
148static atomic_t nr_freq_events __read_mostly;
148 149
149static LIST_HEAD(pmus); 150static LIST_HEAD(pmus);
150static DEFINE_MUTEX(pmus_lock); 151static DEFINE_MUTEX(pmus_lock);
@@ -340,8 +341,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 341static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 342perf_cgroup_from_task(struct task_struct *task)
342{ 343{
343 return container_of(task_subsys_state(task, perf_subsys_id), 344 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 345 struct perf_cgroup, css);
345} 346}
346 347
347static inline bool 348static inline bool
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 592 if (!f.file)
592 return -EBADF; 593 return -EBADF;
593 594
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 595 rcu_read_lock();
596
597 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 598 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 599 ret = PTR_ERR(css);
597 goto out; 600 goto out;
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 620 ret = -EINVAL;
618 } 621 }
619out: 622out:
623 rcu_read_unlock();
620 fdput(f); 624 fdput(f);
621 return ret; 625 return ret;
622} 626}
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
869 873
870 WARN_ON(!irqs_disabled()); 874 WARN_ON(!irqs_disabled());
871 875
872 if (list_empty(&cpuctx->rotation_list)) { 876 if (list_empty(&cpuctx->rotation_list))
873 int was_empty = list_empty(head);
874 list_add(&cpuctx->rotation_list, head); 877 list_add(&cpuctx->rotation_list, head);
875 if (was_empty)
876 tick_nohz_full_kick();
877 }
878} 878}
879 879
880static void get_ctx(struct perf_event_context *ctx) 880static void get_ctx(struct perf_event_context *ctx)
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event)
1216 if (sample_type & PERF_SAMPLE_TIME) 1216 if (sample_type & PERF_SAMPLE_TIME)
1217 size += sizeof(data->time); 1217 size += sizeof(data->time);
1218 1218
1219 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1220 size += sizeof(data->id);
1221
1219 if (sample_type & PERF_SAMPLE_ID) 1222 if (sample_type & PERF_SAMPLE_ID)
1220 size += sizeof(data->id); 1223 size += sizeof(data->id);
1221 1224
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2712 2715
2713 hwc = &event->hw; 2716 hwc = &event->hw;
2714 2717
2715 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2718 if (hwc->interrupts == MAX_INTERRUPTS) {
2716 hwc->interrupts = 0; 2719 hwc->interrupts = 0;
2717 perf_log_throttle(event, 1); 2720 perf_log_throttle(event, 1);
2718 event->pmu->start(event, 0); 2721 event->pmu->start(event, 0);
@@ -2811,10 +2814,11 @@ done:
2811#ifdef CONFIG_NO_HZ_FULL 2814#ifdef CONFIG_NO_HZ_FULL
2812bool perf_event_can_stop_tick(void) 2815bool perf_event_can_stop_tick(void)
2813{ 2816{
2814 if (list_empty(&__get_cpu_var(rotation_list))) 2817 if (atomic_read(&nr_freq_events) ||
2815 return true; 2818 __this_cpu_read(perf_throttled_count))
2816 else
2817 return false; 2819 return false;
2820 else
2821 return true;
2818} 2822}
2819#endif 2823#endif
2820 2824
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head)
3128static void ring_buffer_put(struct ring_buffer *rb); 3132static void ring_buffer_put(struct ring_buffer *rb);
3129static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3133static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3130 3134
3131static void free_event(struct perf_event *event) 3135static void unaccount_event_cpu(struct perf_event *event, int cpu)
3132{ 3136{
3133 irq_work_sync(&event->pending); 3137 if (event->parent)
3138 return;
3139
3140 if (has_branch_stack(event)) {
3141 if (!(event->attach_state & PERF_ATTACH_TASK))
3142 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3143 }
3144 if (is_cgroup_event(event))
3145 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3146}
3147
3148static void unaccount_event(struct perf_event *event)
3149{
3150 if (event->parent)
3151 return;
3152
3153 if (event->attach_state & PERF_ATTACH_TASK)
3154 static_key_slow_dec_deferred(&perf_sched_events);
3155 if (event->attr.mmap || event->attr.mmap_data)
3156 atomic_dec(&nr_mmap_events);
3157 if (event->attr.comm)
3158 atomic_dec(&nr_comm_events);
3159 if (event->attr.task)
3160 atomic_dec(&nr_task_events);
3161 if (event->attr.freq)
3162 atomic_dec(&nr_freq_events);
3163 if (is_cgroup_event(event))
3164 static_key_slow_dec_deferred(&perf_sched_events);
3165 if (has_branch_stack(event))
3166 static_key_slow_dec_deferred(&perf_sched_events);
3167
3168 unaccount_event_cpu(event, event->cpu);
3169}
3134 3170
3171static void __free_event(struct perf_event *event)
3172{
3135 if (!event->parent) { 3173 if (!event->parent) {
3136 if (event->attach_state & PERF_ATTACH_TASK)
3137 static_key_slow_dec_deferred(&perf_sched_events);
3138 if (event->attr.mmap || event->attr.mmap_data)
3139 atomic_dec(&nr_mmap_events);
3140 if (event->attr.comm)
3141 atomic_dec(&nr_comm_events);
3142 if (event->attr.task)
3143 atomic_dec(&nr_task_events);
3144 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3174 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3145 put_callchain_buffers(); 3175 put_callchain_buffers();
3146 if (is_cgroup_event(event)) {
3147 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3148 static_key_slow_dec_deferred(&perf_sched_events);
3149 }
3150
3151 if (has_branch_stack(event)) {
3152 static_key_slow_dec_deferred(&perf_sched_events);
3153 /* is system-wide event */
3154 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3155 atomic_dec(&per_cpu(perf_branch_stack_events,
3156 event->cpu));
3157 }
3158 }
3159 } 3176 }
3160 3177
3178 if (event->destroy)
3179 event->destroy(event);
3180
3181 if (event->ctx)
3182 put_ctx(event->ctx);
3183
3184 call_rcu(&event->rcu_head, free_event_rcu);
3185}
3186static void free_event(struct perf_event *event)
3187{
3188 irq_work_sync(&event->pending);
3189
3190 unaccount_event(event);
3191
3161 if (event->rb) { 3192 if (event->rb) {
3162 struct ring_buffer *rb; 3193 struct ring_buffer *rb;
3163 3194
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event)
3180 if (is_cgroup_event(event)) 3211 if (is_cgroup_event(event))
3181 perf_detach_cgroup(event); 3212 perf_detach_cgroup(event);
3182 3213
3183 if (event->destroy)
3184 event->destroy(event);
3185 3214
3186 if (event->ctx) 3215 __free_event(event);
3187 put_ctx(event->ctx);
3188
3189 call_rcu(&event->rcu_head, free_event_rcu);
3190} 3216}
3191 3217
3192int perf_event_release_kernel(struct perf_event *event) 3218int perf_event_release_kernel(struct perf_event *event)
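
The hunk above splits event teardown into unaccount_event(), which undoes the global and per-CPU bookkeeping (static keys, the nr_*_events counters), and __free_event(), which releases the object itself; free_event() now just calls both. A toy sketch of that two-phase teardown split, with invented names and a single resource standing in for the real state:

#include <stdio.h>
#include <stdlib.h>

static int nr_events;                 /* global accounting, ~ nr_*_events */

struct event {
	char *buf;                    /* per-object resource */
};

static struct event *alloc_event(void)
{
	struct event *e = calloc(1, sizeof(*e));

	if (!e)
		return NULL;
	e->buf = malloc(64);
	nr_events++;                  /* account the new event */
	return e;
}

static void unaccount_event(struct event *e)
{
	(void)e;
	nr_events--;                  /* undo bookkeeping only */
}

static void __free_event(struct event *e)
{
	free(e->buf);                 /* release the object's own resources */
	free(e);
}

static void free_event(struct event *e)
{
	unaccount_event(e);
	__free_event(e);
}

int main(void)
{
	struct event *e = alloc_event();

	if (!e)
		return 1;
	printf("after alloc: nr_events = %d\n", nr_events);
	free_event(e);
	printf("after free:  nr_events = %d\n", nr_events);
	return 0;
}
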
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3544 case PERF_EVENT_IOC_PERIOD: 3570 case PERF_EVENT_IOC_PERIOD:
3545 return perf_event_period(event, (u64 __user *)arg); 3571 return perf_event_period(event, (u64 __user *)arg);
3546 3572
3573 case PERF_EVENT_IOC_ID:
3574 {
3575 u64 id = primary_event_id(event);
3576
3577 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3578 return -EFAULT;
3579 return 0;
3580 }
3581
3547 case PERF_EVENT_IOC_SET_OUTPUT: 3582 case PERF_EVENT_IOC_SET_OUTPUT:
3548 { 3583 {
3549 int ret; 3584 int ret;
@@ -3625,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event,
3625 *running = ctx_time - event->tstamp_running; 3660 *running = ctx_time - event->tstamp_running;
3626} 3661}
3627 3662
3663static void perf_event_init_userpage(struct perf_event *event)
3664{
3665 struct perf_event_mmap_page *userpg;
3666 struct ring_buffer *rb;
3667
3668 rcu_read_lock();
3669 rb = rcu_dereference(event->rb);
3670 if (!rb)
3671 goto unlock;
3672
3673 userpg = rb->user_page;
3674
3675 /* Allow new userspace to detect that bit 0 is deprecated */
3676 userpg->cap_bit0_is_deprecated = 1;
3677 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
3678
3679unlock:
3680 rcu_read_unlock();
3681}
3682
3628void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 3683void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3629{ 3684{
3630} 3685}
@@ -3641,6 +3696,10 @@ void perf_event_update_userpage(struct perf_event *event)
3641 u64 enabled, running, now; 3696 u64 enabled, running, now;
3642 3697
3643 rcu_read_lock(); 3698 rcu_read_lock();
3699 rb = rcu_dereference(event->rb);
3700 if (!rb)
3701 goto unlock;
3702
3644 /* 3703 /*
3645 * compute total_time_enabled, total_time_running 3704 * compute total_time_enabled, total_time_running
3646 * based on snapshot values taken when the event 3705 * based on snapshot values taken when the event
@@ -3651,12 +3710,8 @@ void perf_event_update_userpage(struct perf_event *event)
3651 * NMI context 3710 * NMI context
3652 */ 3711 */
3653 calc_timer_values(event, &now, &enabled, &running); 3712 calc_timer_values(event, &now, &enabled, &running);
3654 rb = rcu_dereference(event->rb);
3655 if (!rb)
3656 goto unlock;
3657 3713
3658 userpg = rb->user_page; 3714 userpg = rb->user_page;
3659
3660 /* 3715 /*
3661 * Disable preemption so as to not let the corresponding user-space 3716 * Disable preemption so as to not let the corresponding user-space
3662 * spin too long if we get preempted. 3717 * spin too long if we get preempted.
@@ -4009,6 +4064,7 @@ again:
4009 ring_buffer_attach(event, rb); 4064 ring_buffer_attach(event, rb);
4010 rcu_assign_pointer(event->rb, rb); 4065 rcu_assign_pointer(event->rb, rb);
4011 4066
4067 perf_event_init_userpage(event);
4012 perf_event_update_userpage(event); 4068 perf_event_update_userpage(event);
4013 4069
4014unlock: 4070unlock:
@@ -4251,7 +4307,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4251 if (sample_type & PERF_SAMPLE_TIME) 4307 if (sample_type & PERF_SAMPLE_TIME)
4252 data->time = perf_clock(); 4308 data->time = perf_clock();
4253 4309
4254 if (sample_type & PERF_SAMPLE_ID) 4310 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4255 data->id = primary_event_id(event); 4311 data->id = primary_event_id(event);
4256 4312
4257 if (sample_type & PERF_SAMPLE_STREAM_ID) 4313 if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4290,6 +4346,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4290 4346
4291 if (sample_type & PERF_SAMPLE_CPU) 4347 if (sample_type & PERF_SAMPLE_CPU)
4292 perf_output_put(handle, data->cpu_entry); 4348 perf_output_put(handle, data->cpu_entry);
4349
4350 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4351 perf_output_put(handle, data->id);
4293} 4352}
4294 4353
4295void perf_event__output_id_sample(struct perf_event *event, 4354void perf_event__output_id_sample(struct perf_event *event,
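
The two hunks above place the new PERF_SAMPLE_IDENTIFIER value at a fixed spot: first thing after the header in sample records, and last in the id block appended to non-sample records. That lets a consumer pull the event id out of any record without knowing the full per-event layout. A speculative, simplified parser sketch (the record layout here is invented for illustration, not the real perf ABI):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical, simplified record layout */
struct rec_header { uint32_t type; uint16_t misc; uint16_t size; };
#define REC_SAMPLE 9

/*
 * The id sits right after the header in sample records and in the
 * last u64 of everything else, so it is found without parsing the body.
 */
static uint64_t record_id(const void *rec)
{
	const struct rec_header *hdr = rec;
	uint64_t id;

	if (hdr->type == REC_SAMPLE)
		memcpy(&id, (const char *)rec + sizeof(*hdr), sizeof(id));
	else
		memcpy(&id, (const char *)rec + hdr->size - sizeof(id), sizeof(id));
	return id;
}

int main(void)
{
	unsigned char buf[sizeof(struct rec_header) + 2 * sizeof(uint64_t)];
	struct rec_header hdr = { .type = REC_SAMPLE, .size = sizeof(buf) };
	uint64_t id = 42, payload = 7;

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), &id, sizeof(id));
	memcpy(buf + sizeof(hdr) + sizeof(id), &payload, sizeof(payload));

	printf("id = %llu\n", (unsigned long long)record_id(buf));
	return 0;
}
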
@@ -4355,7 +4414,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4355 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4414 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4356 n = 0; 4415 n = 0;
4357 4416
4358 if (sub != event) 4417 if ((sub != event) &&
4418 (sub->state == PERF_EVENT_STATE_ACTIVE))
4359 sub->pmu->read(sub); 4419 sub->pmu->read(sub);
4360 4420
4361 values[n++] = perf_event_count(sub); 4421 values[n++] = perf_event_count(sub);
@@ -4402,6 +4462,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4402 4462
4403 perf_output_put(handle, *header); 4463 perf_output_put(handle, *header);
4404 4464
4465 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4466 perf_output_put(handle, data->id);
4467
4405 if (sample_type & PERF_SAMPLE_IP) 4468 if (sample_type & PERF_SAMPLE_IP)
4406 perf_output_put(handle, data->ip); 4469 perf_output_put(handle, data->ip);
4407 4470
@@ -4462,20 +4525,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4462 } 4525 }
4463 } 4526 }
4464 4527
4465 if (!event->attr.watermark) {
4466 int wakeup_events = event->attr.wakeup_events;
4467
4468 if (wakeup_events) {
4469 struct ring_buffer *rb = handle->rb;
4470 int events = local_inc_return(&rb->events);
4471
4472 if (events >= wakeup_events) {
4473 local_sub(wakeup_events, &rb->events);
4474 local_inc(&rb->wakeup);
4475 }
4476 }
4477 }
4478
4479 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4528 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4480 if (data->br_stack) { 4529 if (data->br_stack) {
4481 size_t size; 4530 size_t size;
@@ -4511,16 +4560,31 @@ void perf_output_sample(struct perf_output_handle *handle,
4511 } 4560 }
4512 } 4561 }
4513 4562
4514 if (sample_type & PERF_SAMPLE_STACK_USER) 4563 if (sample_type & PERF_SAMPLE_STACK_USER) {
4515 perf_output_sample_ustack(handle, 4564 perf_output_sample_ustack(handle,
4516 data->stack_user_size, 4565 data->stack_user_size,
4517 data->regs_user.regs); 4566 data->regs_user.regs);
4567 }
4518 4568
4519 if (sample_type & PERF_SAMPLE_WEIGHT) 4569 if (sample_type & PERF_SAMPLE_WEIGHT)
4520 perf_output_put(handle, data->weight); 4570 perf_output_put(handle, data->weight);
4521 4571
4522 if (sample_type & PERF_SAMPLE_DATA_SRC) 4572 if (sample_type & PERF_SAMPLE_DATA_SRC)
4523 perf_output_put(handle, data->data_src.val); 4573 perf_output_put(handle, data->data_src.val);
4574
4575 if (!event->attr.watermark) {
4576 int wakeup_events = event->attr.wakeup_events;
4577
4578 if (wakeup_events) {
4579 struct ring_buffer *rb = handle->rb;
4580 int events = local_inc_return(&rb->events);
4581
4582 if (events >= wakeup_events) {
4583 local_sub(wakeup_events, &rb->events);
4584 local_inc(&rb->wakeup);
4585 }
4586 }
4587 }
4524} 4588}
4525 4589
4526void perf_prepare_sample(struct perf_event_header *header, 4590void perf_prepare_sample(struct perf_event_header *header,
@@ -4680,12 +4744,10 @@ perf_event_read_event(struct perf_event *event,
4680 perf_output_end(&handle); 4744 perf_output_end(&handle);
4681} 4745}
4682 4746
4683typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4684typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4747typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4685 4748
4686static void 4749static void
4687perf_event_aux_ctx(struct perf_event_context *ctx, 4750perf_event_aux_ctx(struct perf_event_context *ctx,
4688 perf_event_aux_match_cb match,
4689 perf_event_aux_output_cb output, 4751 perf_event_aux_output_cb output,
4690 void *data) 4752 void *data)
4691{ 4753{
@@ -4696,15 +4758,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
4696 continue; 4758 continue;
4697 if (!event_filter_match(event)) 4759 if (!event_filter_match(event))
4698 continue; 4760 continue;
4699 if (match(event, data)) 4761 output(event, data);
4700 output(event, data);
4701 } 4762 }
4702} 4763}
4703 4764
4704static void 4765static void
4705perf_event_aux(perf_event_aux_match_cb match, 4766perf_event_aux(perf_event_aux_output_cb output, void *data,
4706 perf_event_aux_output_cb output,
4707 void *data,
4708 struct perf_event_context *task_ctx) 4767 struct perf_event_context *task_ctx)
4709{ 4768{
4710 struct perf_cpu_context *cpuctx; 4769 struct perf_cpu_context *cpuctx;
@@ -4717,7 +4776,7 @@ perf_event_aux(perf_event_aux_match_cb match,
4717 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4776 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4718 if (cpuctx->unique_pmu != pmu) 4777 if (cpuctx->unique_pmu != pmu)
4719 goto next; 4778 goto next;
4720 perf_event_aux_ctx(&cpuctx->ctx, match, output, data); 4779 perf_event_aux_ctx(&cpuctx->ctx, output, data);
4721 if (task_ctx) 4780 if (task_ctx)
4722 goto next; 4781 goto next;
4723 ctxn = pmu->task_ctx_nr; 4782 ctxn = pmu->task_ctx_nr;
@@ -4725,14 +4784,14 @@ perf_event_aux(perf_event_aux_match_cb match,
4725 goto next; 4784 goto next;
4726 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4785 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4727 if (ctx) 4786 if (ctx)
4728 perf_event_aux_ctx(ctx, match, output, data); 4787 perf_event_aux_ctx(ctx, output, data);
4729next: 4788next:
4730 put_cpu_ptr(pmu->pmu_cpu_context); 4789 put_cpu_ptr(pmu->pmu_cpu_context);
4731 } 4790 }
4732 4791
4733 if (task_ctx) { 4792 if (task_ctx) {
4734 preempt_disable(); 4793 preempt_disable();
4735 perf_event_aux_ctx(task_ctx, match, output, data); 4794 perf_event_aux_ctx(task_ctx, output, data);
4736 preempt_enable(); 4795 preempt_enable();
4737 } 4796 }
4738 rcu_read_unlock(); 4797 rcu_read_unlock();
@@ -4741,7 +4800,7 @@ next:
4741/* 4800/*
4742 * task tracking -- fork/exit 4801 * task tracking -- fork/exit
4743 * 4802 *
4744 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 4803 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4745 */ 4804 */
4746 4805
4747struct perf_task_event { 4806struct perf_task_event {
@@ -4759,6 +4818,13 @@ struct perf_task_event {
4759 } event_id; 4818 } event_id;
4760}; 4819};
4761 4820
4821static int perf_event_task_match(struct perf_event *event)
4822{
4823 return event->attr.comm || event->attr.mmap ||
4824 event->attr.mmap2 || event->attr.mmap_data ||
4825 event->attr.task;
4826}
4827
4762static void perf_event_task_output(struct perf_event *event, 4828static void perf_event_task_output(struct perf_event *event,
4763 void *data) 4829 void *data)
4764{ 4830{
@@ -4768,6 +4834,9 @@ static void perf_event_task_output(struct perf_event *event,
4768 struct task_struct *task = task_event->task; 4834 struct task_struct *task = task_event->task;
4769 int ret, size = task_event->event_id.header.size; 4835 int ret, size = task_event->event_id.header.size;
4770 4836
4837 if (!perf_event_task_match(event))
4838 return;
4839
4771 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4840 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4772 4841
4773 ret = perf_output_begin(&handle, event, 4842 ret = perf_output_begin(&handle, event,
@@ -4790,13 +4859,6 @@ out:
4790 task_event->event_id.header.size = size; 4859 task_event->event_id.header.size = size;
4791} 4860}
4792 4861
4793static int perf_event_task_match(struct perf_event *event,
4794 void *data __maybe_unused)
4795{
4796 return event->attr.comm || event->attr.mmap ||
4797 event->attr.mmap_data || event->attr.task;
4798}
4799
4800static void perf_event_task(struct task_struct *task, 4862static void perf_event_task(struct task_struct *task,
4801 struct perf_event_context *task_ctx, 4863 struct perf_event_context *task_ctx,
4802 int new) 4864 int new)
@@ -4825,8 +4887,7 @@ static void perf_event_task(struct task_struct *task,
4825 }, 4887 },
4826 }; 4888 };
4827 4889
4828 perf_event_aux(perf_event_task_match, 4890 perf_event_aux(perf_event_task_output,
4829 perf_event_task_output,
4830 &task_event, 4891 &task_event,
4831 task_ctx); 4892 task_ctx);
4832} 4893}
@@ -4853,6 +4914,11 @@ struct perf_comm_event {
4853 } event_id; 4914 } event_id;
4854}; 4915};
4855 4916
4917static int perf_event_comm_match(struct perf_event *event)
4918{
4919 return event->attr.comm;
4920}
4921
4856static void perf_event_comm_output(struct perf_event *event, 4922static void perf_event_comm_output(struct perf_event *event,
4857 void *data) 4923 void *data)
4858{ 4924{
@@ -4862,6 +4928,9 @@ static void perf_event_comm_output(struct perf_event *event,
4862 int size = comm_event->event_id.header.size; 4928 int size = comm_event->event_id.header.size;
4863 int ret; 4929 int ret;
4864 4930
4931 if (!perf_event_comm_match(event))
4932 return;
4933
4865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4934 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4866 ret = perf_output_begin(&handle, event, 4935 ret = perf_output_begin(&handle, event,
4867 comm_event->event_id.header.size); 4936 comm_event->event_id.header.size);
@@ -4883,12 +4952,6 @@ out:
4883 comm_event->event_id.header.size = size; 4952 comm_event->event_id.header.size = size;
4884} 4953}
4885 4954
4886static int perf_event_comm_match(struct perf_event *event,
4887 void *data __maybe_unused)
4888{
4889 return event->attr.comm;
4890}
4891
4892static void perf_event_comm_event(struct perf_comm_event *comm_event) 4955static void perf_event_comm_event(struct perf_comm_event *comm_event)
4893{ 4956{
4894 char comm[TASK_COMM_LEN]; 4957 char comm[TASK_COMM_LEN];
@@ -4903,8 +4966,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4903 4966
4904 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4967 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4905 4968
4906 perf_event_aux(perf_event_comm_match, 4969 perf_event_aux(perf_event_comm_output,
4907 perf_event_comm_output,
4908 comm_event, 4970 comm_event,
4909 NULL); 4971 NULL);
4910} 4972}
@@ -4955,6 +5017,9 @@ struct perf_mmap_event {
4955 5017
4956 const char *file_name; 5018 const char *file_name;
4957 int file_size; 5019 int file_size;
5020 int maj, min;
5021 u64 ino;
5022 u64 ino_generation;
4958 5023
4959 struct { 5024 struct {
4960 struct perf_event_header header; 5025 struct perf_event_header header;
@@ -4967,6 +5032,17 @@ struct perf_mmap_event {
4967 } event_id; 5032 } event_id;
4968}; 5033};
4969 5034
5035static int perf_event_mmap_match(struct perf_event *event,
5036 void *data)
5037{
5038 struct perf_mmap_event *mmap_event = data;
5039 struct vm_area_struct *vma = mmap_event->vma;
5040 int executable = vma->vm_flags & VM_EXEC;
5041
5042 return (!executable && event->attr.mmap_data) ||
5043 (executable && (event->attr.mmap || event->attr.mmap2));
5044}
5045
4970static void perf_event_mmap_output(struct perf_event *event, 5046static void perf_event_mmap_output(struct perf_event *event,
4971 void *data) 5047 void *data)
4972{ 5048{
@@ -4976,6 +5052,17 @@ static void perf_event_mmap_output(struct perf_event *event,
4976 int size = mmap_event->event_id.header.size; 5052 int size = mmap_event->event_id.header.size;
4977 int ret; 5053 int ret;
4978 5054
5055 if (!perf_event_mmap_match(event, data))
5056 return;
5057
5058 if (event->attr.mmap2) {
5059 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5060 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5061 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5062 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5063 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5064 }
5065
4979 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5066 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4980 ret = perf_output_begin(&handle, event, 5067 ret = perf_output_begin(&handle, event,
4981 mmap_event->event_id.header.size); 5068 mmap_event->event_id.header.size);
@@ -4986,6 +5073,14 @@ static void perf_event_mmap_output(struct perf_event *event,
4986 mmap_event->event_id.tid = perf_event_tid(event, current); 5073 mmap_event->event_id.tid = perf_event_tid(event, current);
4987 5074
4988 perf_output_put(&handle, mmap_event->event_id); 5075 perf_output_put(&handle, mmap_event->event_id);
5076
5077 if (event->attr.mmap2) {
5078 perf_output_put(&handle, mmap_event->maj);
5079 perf_output_put(&handle, mmap_event->min);
5080 perf_output_put(&handle, mmap_event->ino);
5081 perf_output_put(&handle, mmap_event->ino_generation);
5082 }
5083
4989 __output_copy(&handle, mmap_event->file_name, 5084 __output_copy(&handle, mmap_event->file_name,
4990 mmap_event->file_size); 5085 mmap_event->file_size);
4991 5086
@@ -4996,21 +5091,12 @@ out:
4996 mmap_event->event_id.header.size = size; 5091 mmap_event->event_id.header.size = size;
4997} 5092}
4998 5093
4999static int perf_event_mmap_match(struct perf_event *event,
5000 void *data)
5001{
5002 struct perf_mmap_event *mmap_event = data;
5003 struct vm_area_struct *vma = mmap_event->vma;
5004 int executable = vma->vm_flags & VM_EXEC;
5005
5006 return (!executable && event->attr.mmap_data) ||
5007 (executable && event->attr.mmap);
5008}
5009
5010static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5094static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5011{ 5095{
5012 struct vm_area_struct *vma = mmap_event->vma; 5096 struct vm_area_struct *vma = mmap_event->vma;
5013 struct file *file = vma->vm_file; 5097 struct file *file = vma->vm_file;
5098 int maj = 0, min = 0;
5099 u64 ino = 0, gen = 0;
5014 unsigned int size; 5100 unsigned int size;
5015 char tmp[16]; 5101 char tmp[16];
5016 char *buf = NULL; 5102 char *buf = NULL;
@@ -5019,6 +5105,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5019 memset(tmp, 0, sizeof(tmp)); 5105 memset(tmp, 0, sizeof(tmp));
5020 5106
5021 if (file) { 5107 if (file) {
5108 struct inode *inode;
5109 dev_t dev;
5022 /* 5110 /*
5023 * d_path works from the end of the rb backwards, so we 5111 * d_path works from the end of the rb backwards, so we
5024 * need to add enough zero bytes after the string to handle 5112 * need to add enough zero bytes after the string to handle
@@ -5034,6 +5122,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5034 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5122 name = strncpy(tmp, "//toolong", sizeof(tmp));
5035 goto got_name; 5123 goto got_name;
5036 } 5124 }
5125 inode = file_inode(vma->vm_file);
5126 dev = inode->i_sb->s_dev;
5127 ino = inode->i_ino;
5128 gen = inode->i_generation;
5129 maj = MAJOR(dev);
5130 min = MINOR(dev);
5131
5037 } else { 5132 } else {
5038 if (arch_vma_name(mmap_event->vma)) { 5133 if (arch_vma_name(mmap_event->vma)) {
5039 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5134 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
@@ -5064,14 +5159,17 @@ got_name:
5064 5159
5065 mmap_event->file_name = name; 5160 mmap_event->file_name = name;
5066 mmap_event->file_size = size; 5161 mmap_event->file_size = size;
5162 mmap_event->maj = maj;
5163 mmap_event->min = min;
5164 mmap_event->ino = ino;
5165 mmap_event->ino_generation = gen;
5067 5166
5068 if (!(vma->vm_flags & VM_EXEC)) 5167 if (!(vma->vm_flags & VM_EXEC))
5069 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5168 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5070 5169
5071 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5170 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5072 5171
5073 perf_event_aux(perf_event_mmap_match, 5172 perf_event_aux(perf_event_mmap_output,
5074 perf_event_mmap_output,
5075 mmap_event, 5173 mmap_event,
5076 NULL); 5174 NULL);
5077 5175
@@ -5101,6 +5199,10 @@ void perf_event_mmap(struct vm_area_struct *vma)
5101 .len = vma->vm_end - vma->vm_start, 5199 .len = vma->vm_end - vma->vm_start,
5102 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5200 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5103 }, 5201 },
5202 /* .maj (attr_mmap2 only) */
5203 /* .min (attr_mmap2 only) */
5204 /* .ino (attr_mmap2 only) */
5205 /* .ino_generation (attr_mmap2 only) */
5104 }; 5206 };
5105 5207
5106 perf_event_mmap_event(&mmap_event); 5208 perf_event_mmap_event(&mmap_event);
@@ -5178,6 +5280,7 @@ static int __perf_event_overflow(struct perf_event *event,
5178 __this_cpu_inc(perf_throttled_count); 5280 __this_cpu_inc(perf_throttled_count);
5179 hwc->interrupts = MAX_INTERRUPTS; 5281 hwc->interrupts = MAX_INTERRUPTS;
5180 perf_log_throttle(event, 0); 5282 perf_log_throttle(event, 0);
5283 tick_nohz_full_kick();
5181 ret = 1; 5284 ret = 1;
5182 } 5285 }
5183 } 5286 }
@@ -6443,6 +6546,44 @@ unlock:
6443 return pmu; 6546 return pmu;
6444} 6547}
6445 6548
6549static void account_event_cpu(struct perf_event *event, int cpu)
6550{
6551 if (event->parent)
6552 return;
6553
6554 if (has_branch_stack(event)) {
6555 if (!(event->attach_state & PERF_ATTACH_TASK))
6556 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6557 }
6558 if (is_cgroup_event(event))
6559 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6560}
6561
6562static void account_event(struct perf_event *event)
6563{
6564 if (event->parent)
6565 return;
6566
6567 if (event->attach_state & PERF_ATTACH_TASK)
6568 static_key_slow_inc(&perf_sched_events.key);
6569 if (event->attr.mmap || event->attr.mmap_data)
6570 atomic_inc(&nr_mmap_events);
6571 if (event->attr.comm)
6572 atomic_inc(&nr_comm_events);
6573 if (event->attr.task)
6574 atomic_inc(&nr_task_events);
6575 if (event->attr.freq) {
6576 if (atomic_inc_return(&nr_freq_events) == 1)
6577 tick_nohz_full_kick_all();
6578 }
6579 if (has_branch_stack(event))
6580 static_key_slow_inc(&perf_sched_events.key);
6581 if (is_cgroup_event(event))
6582 static_key_slow_inc(&perf_sched_events.key);
6583
6584 account_event_cpu(event, event->cpu);
6585}
6586
6446/* 6587/*
6447 * Allocate and initialize a event structure 6588 * Allocate and initialize a event structure
6448 */ 6589 */
@@ -6457,7 +6598,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6457 struct pmu *pmu; 6598 struct pmu *pmu;
6458 struct perf_event *event; 6599 struct perf_event *event;
6459 struct hw_perf_event *hwc; 6600 struct hw_perf_event *hwc;
6460 long err; 6601 long err = -EINVAL;
6461 6602
6462 if ((unsigned)cpu >= nr_cpu_ids) { 6603 if ((unsigned)cpu >= nr_cpu_ids) {
6463 if (!task || cpu != -1) 6604 if (!task || cpu != -1)
@@ -6540,49 +6681,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6540 * we currently do not support PERF_FORMAT_GROUP on inherited events 6681 * we currently do not support PERF_FORMAT_GROUP on inherited events
6541 */ 6682 */
6542 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6683 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6543 goto done; 6684 goto err_ns;
6544 6685
6545 pmu = perf_init_event(event); 6686 pmu = perf_init_event(event);
6546
6547done:
6548 err = 0;
6549 if (!pmu) 6687 if (!pmu)
6550 err = -EINVAL; 6688 goto err_ns;
6551 else if (IS_ERR(pmu)) 6689 else if (IS_ERR(pmu)) {
6552 err = PTR_ERR(pmu); 6690 err = PTR_ERR(pmu);
6553 6691 goto err_ns;
6554 if (err) {
6555 if (event->ns)
6556 put_pid_ns(event->ns);
6557 kfree(event);
6558 return ERR_PTR(err);
6559 } 6692 }
6560 6693
6561 if (!event->parent) { 6694 if (!event->parent) {
6562 if (event->attach_state & PERF_ATTACH_TASK)
6563 static_key_slow_inc(&perf_sched_events.key);
6564 if (event->attr.mmap || event->attr.mmap_data)
6565 atomic_inc(&nr_mmap_events);
6566 if (event->attr.comm)
6567 atomic_inc(&nr_comm_events);
6568 if (event->attr.task)
6569 atomic_inc(&nr_task_events);
6570 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6695 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6571 err = get_callchain_buffers(); 6696 err = get_callchain_buffers();
6572 if (err) { 6697 if (err)
6573 free_event(event); 6698 goto err_pmu;
6574 return ERR_PTR(err);
6575 }
6576 }
6577 if (has_branch_stack(event)) {
6578 static_key_slow_inc(&perf_sched_events.key);
6579 if (!(event->attach_state & PERF_ATTACH_TASK))
6580 atomic_inc(&per_cpu(perf_branch_stack_events,
6581 event->cpu));
6582 } 6699 }
6583 } 6700 }
6584 6701
6585 return event; 6702 return event;
6703
6704err_pmu:
6705 if (event->destroy)
6706 event->destroy(event);
6707err_ns:
6708 if (event->ns)
6709 put_pid_ns(event->ns);
6710 kfree(event);
6711
6712 return ERR_PTR(err);
6586} 6713}
6587 6714
6588static int perf_copy_attr(struct perf_event_attr __user *uattr, 6715static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -6640,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6640 if (ret) 6767 if (ret)
6641 return -EFAULT; 6768 return -EFAULT;
6642 6769
6770 /* disabled for now */
6771 if (attr->mmap2)
6772 return -EINVAL;
6773
6643 if (attr->__reserved_1) 6774 if (attr->__reserved_1)
6644 return -EINVAL; 6775 return -EINVAL;
6645 6776
@@ -6864,17 +6995,14 @@ SYSCALL_DEFINE5(perf_event_open,
6864 6995
6865 if (flags & PERF_FLAG_PID_CGROUP) { 6996 if (flags & PERF_FLAG_PID_CGROUP) {
6866 err = perf_cgroup_connect(pid, event, &attr, group_leader); 6997 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6867 if (err) 6998 if (err) {
6868 goto err_alloc; 6999 __free_event(event);
6869 /* 7000 goto err_task;
6870 * one more event: 7001 }
6871 * - that has cgroup constraint on event->cpu
6872 * - that may need work on context switch
6873 */
6874 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6875 static_key_slow_inc(&perf_sched_events.key);
6876 } 7002 }
6877 7003
7004 account_event(event);
7005
6878 /* 7006 /*
6879 * Special case software events and allow them to be part of 7007 * Special case software events and allow them to be part of
6880 * any hardware group. 7008 * any hardware group.
@@ -7070,6 +7198,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7070 goto err; 7198 goto err;
7071 } 7199 }
7072 7200
7201 account_event(event);
7202
7073 ctx = find_get_context(event->pmu, task, cpu); 7203 ctx = find_get_context(event->pmu, task, cpu);
7074 if (IS_ERR(ctx)) { 7204 if (IS_ERR(ctx)) {
7075 err = PTR_ERR(ctx); 7205 err = PTR_ERR(ctx);
@@ -7106,18 +7236,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7106 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7236 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7107 event_entry) { 7237 event_entry) {
7108 perf_remove_from_context(event); 7238 perf_remove_from_context(event);
7239 unaccount_event_cpu(event, src_cpu);
7109 put_ctx(src_ctx); 7240 put_ctx(src_ctx);
7110 list_add(&event->event_entry, &events); 7241 list_add(&event->migrate_entry, &events);
7111 } 7242 }
7112 mutex_unlock(&src_ctx->mutex); 7243 mutex_unlock(&src_ctx->mutex);
7113 7244
7114 synchronize_rcu(); 7245 synchronize_rcu();
7115 7246
7116 mutex_lock(&dst_ctx->mutex); 7247 mutex_lock(&dst_ctx->mutex);
7117 list_for_each_entry_safe(event, tmp, &events, event_entry) { 7248 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7118 list_del(&event->event_entry); 7249 list_del(&event->migrate_entry);
7119 if (event->state >= PERF_EVENT_STATE_OFF) 7250 if (event->state >= PERF_EVENT_STATE_OFF)
7120 event->state = PERF_EVENT_STATE_INACTIVE; 7251 event->state = PERF_EVENT_STATE_INACTIVE;
7252 account_event_cpu(event, dst_cpu);
7121 perf_install_in_context(dst_ctx, event, dst_cpu); 7253 perf_install_in_context(dst_ctx, event, dst_cpu);
7122 get_ctx(dst_ctx); 7254 get_ctx(dst_ctx);
7123 } 7255 }
@@ -7798,7 +7930,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7930device_initcall(perf_event_sysfs_init);
7799 7931
7800#ifdef CONFIG_CGROUP_PERF 7932#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7933static struct cgroup_subsys_state *
7934perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7935{
7803 struct perf_cgroup *jc; 7936 struct perf_cgroup *jc;
7804 7937
@@ -7815,11 +7948,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7948 return &jc->css;
7816} 7949}
7817 7950
7818static void perf_cgroup_css_free(struct cgroup *cont) 7951static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7952{
7820 struct perf_cgroup *jc; 7953 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7954
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7955 free_percpu(jc->info);
7824 kfree(jc); 7956 kfree(jc);
7825} 7957}
@@ -7831,15 +7963,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 7963 return 0;
7832} 7964}
7833 7965
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7966static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7967 struct cgroup_taskset *tset)
7835{ 7968{
7836 struct task_struct *task; 7969 struct task_struct *task;
7837 7970
7838 cgroup_taskset_for_each(task, cgrp, tset) 7971 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 7972 task_function_call(task, __perf_cgroup_move, task);
7840} 7973}
7841 7974
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7975static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7976 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 7977 struct task_struct *task)
7844{ 7978{
7845 /* 7979 /*
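For reference, the record body that perf_event_mmap_output() above emits when attr.mmap2 is set can be pictured as the struct below. This is an illustrative sketch only, not code from the patch: the maj/min/ino/ino_generation fields and their order come straight from the hunk, while the pid/tid/addr/len/pgoff part is assumed to match the existing PERF_RECORD_MMAP layout, which is only partially visible here. Note also that perf_copy_attr() still rejects attr.mmap2 ("disabled for now"), so the record cannot actually be requested yet in this tree.

struct perf_record_mmap2_sketch {
	struct perf_event_header header;	/* .type = PERF_RECORD_MMAP2 */
	__u32 pid, tid;				/* assumed, as in PERF_RECORD_MMAP */
	__u64 addr, len, pgoff;			/* assumed, as in PERF_RECORD_MMAP */
	__u32 maj, min;				/* device of the backing file */
	__u64 ino, ino_generation;		/* backing inode and its generation */
	char  filename[];			/* NUL-padded pathname */
	/* followed by the usual sample_id trailer when sample_id_all is set */
};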
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..9c2ddfbf4525 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -87,10 +87,31 @@ again:
87 goto out; 87 goto out;
88 88
89 /* 89 /*
90 * Publish the known good head. Rely on the full barrier implied 90 * Since the mmap() consumer (userspace) can run on a different CPU:
91 * by atomic_dec_and_test() order the rb->head read and this 91 *
92 * write. 92 * kernel user
93 *
94 * READ ->data_tail READ ->data_head
95 * smp_mb() (A) smp_rmb() (C)
96 * WRITE $data READ $data
97 * smp_wmb() (B) smp_mb() (D)
98 * STORE ->data_head WRITE ->data_tail
99 *
100 * Where A pairs with D, and B pairs with C.
101 *
102 * I don't think A needs to be a full barrier because we won't in fact
103 * write data until we see the store from userspace. So we simply don't
104 * issue the data WRITE until we observe it. Be conservative for now.
105 *
106 * OTOH, D needs to be a full barrier since it separates the data READ
107 * from the tail WRITE.
108 *
109 * For B a WMB is sufficient since it separates two WRITEs, and for C
110 * an RMB is sufficient since it separates two READs.
111 *
112 * See perf_output_begin().
93 */ 113 */
114 smp_wmb();
94 rb->user_page->data_head = head; 115 rb->user_page->data_head = head;
95 116
96 /* 117 /*
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle,
154 * Userspace could choose to issue a mb() before updating the 175 * Userspace could choose to issue a mb() before updating the
155 * tail pointer. So that all reads will be completed before the 176 * tail pointer. So that all reads will be completed before the
156 * write is issued. 177 * write is issued.
178 *
179 * See perf_output_put_handle().
157 */ 180 */
158 tail = ACCESS_ONCE(rb->user_page->data_tail); 181 tail = ACCESS_ONCE(rb->user_page->data_tail);
159 smp_rmb(); 182 smp_mb();
160 offset = head = local_read(&rb->head); 183 offset = head = local_read(&rb->head);
161 head += size; 184 head += size;
162 if (unlikely(!perf_output_space(rb, tail, offset, head))) 185 if (unlikely(!perf_output_space(rb, tail, offset, head)))
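The comment added above documents the kernel side of the ordering, barriers (A) and (B). A minimal userspace consumer for the (C)/(D) side might look like the sketch below. This is an illustration under assumptions, not an API from the patch: it uses the data_head/data_tail fields of struct perf_event_mmap_page referenced in the hunk, GCC/Clang __atomic builtins in place of the kernel's smp_*() primitives, and it ignores overwrite mode and per-record wrap-around handling.

#include <stddef.h>
#include <linux/perf_event.h>

static void drain_ring(struct perf_event_mmap_page *pg, void *data,
		       size_t data_size,
		       void (*consume)(const void *buf, size_t len))
{
	__u64 head, tail;

	/* (C): acquire the head; pairs with the smp_wmb() (B) added above. */
	head = __atomic_load_n(&pg->data_head, __ATOMIC_ACQUIRE);
	tail = pg->data_tail;

	while (tail != head) {
		size_t off = tail % data_size;		/* head/tail are free-running */
		size_t chunk = data_size - off;

		if (chunk > head - tail)
			chunk = head - tail;
		consume((const char *)data + off, chunk);
		tail += chunk;
	}

	/* (D): full barrier so the record reads above complete before the
	 * tail store that perf_output_begin() will observe. */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	__atomic_store_n(&pg->data_tail, tail, __ATOMIC_RELAXED);
}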
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d629..ad8e1bdca70e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1682,12 +1682,10 @@ static bool handle_trampoline(struct pt_regs *regs)
1682 tmp = ri; 1682 tmp = ri;
1683 ri = ri->next; 1683 ri = ri->next;
1684 kfree(tmp); 1684 kfree(tmp);
1685 utask->depth--;
1685 1686
1686 if (!chained) 1687 if (!chained)
1687 break; 1688 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri); 1689 BUG_ON(!ri);
1692 } 1690 }
1693 1691
diff --git a/kernel/extable.c b/kernel/extable.c
index 67460b93b1a1..832cb28105bb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) { 44 if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
45 pr_notice("Sorting __ex_table...\n"); 45 pr_notice("Sorting __ex_table...\n");
46 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
47 } 47 }
diff --git a/kernel/fork.c b/kernel/fork.c
index bf46287c91a4..086fe73ad6bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 struct rb_node **rb_link, *rb_parent; 351 struct rb_node **rb_link, *rb_parent;
352 int retval; 352 int retval;
353 unsigned long charge; 353 unsigned long charge;
354 struct mempolicy *pol;
355 354
356 uprobe_start_dup_mmap(); 355 uprobe_start_dup_mmap();
357 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
400 goto fail_nomem; 399 goto fail_nomem;
401 *tmp = *mpnt; 400 *tmp = *mpnt;
402 INIT_LIST_HEAD(&tmp->anon_vma_chain); 401 INIT_LIST_HEAD(&tmp->anon_vma_chain);
403 pol = mpol_dup(vma_policy(mpnt)); 402 retval = vma_dup_policy(mpnt, tmp);
404 retval = PTR_ERR(pol); 403 if (retval)
405 if (IS_ERR(pol))
406 goto fail_nomem_policy; 404 goto fail_nomem_policy;
407 vma_set_policy(tmp, pol);
408 tmp->vm_mm = mm; 405 tmp->vm_mm = mm;
409 if (anon_vma_fork(tmp, mpnt)) 406 if (anon_vma_fork(tmp, mpnt))
410 goto fail_nomem_anon_vma_fork; 407 goto fail_nomem_anon_vma_fork;
@@ -472,7 +469,7 @@ out:
472 uprobe_end_dup_mmap(); 469 uprobe_end_dup_mmap();
473 return retval; 470 return retval;
474fail_nomem_anon_vma_fork: 471fail_nomem_anon_vma_fork:
475 mpol_put(pol); 472 mpol_put(vma_policy(tmp));
476fail_nomem_policy: 473fail_nomem_policy:
477 kmem_cache_free(vm_area_cachep, tmp); 474 kmem_cache_free(vm_area_cachep, tmp);
478fail_nomem: 475fail_nomem:
@@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm)
522{ 519{
523#ifdef CONFIG_AIO 520#ifdef CONFIG_AIO
524 spin_lock_init(&mm->ioctx_lock); 521 spin_lock_init(&mm->ioctx_lock);
525 INIT_HLIST_HEAD(&mm->ioctx_list); 522 mm->ioctx_table = NULL;
526#endif 523#endif
527} 524}
528 525
@@ -1173,13 +1170,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1173 return ERR_PTR(-EINVAL); 1170 return ERR_PTR(-EINVAL);
1174 1171
1175 /* 1172 /*
1176 * If the new process will be in a different pid namespace 1173 * If the new process will be in a different pid or user namespace
1177 * don't allow the creation of threads. 1174 * do not allow it to share a thread group or signal handlers or
1175 * parent with the forking task.
1178 */ 1176 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1177 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
1180 (task_active_pid_ns(current) != 1178 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1181 current->nsproxy->pid_ns_for_children)) 1179 (task_active_pid_ns(current) !=
1182 return ERR_PTR(-EINVAL); 1180 current->nsproxy->pid_ns_for_children))
1181 return ERR_PTR(-EINVAL);
1182 }
1183 1183
1184 retval = security_task_create(clone_flags); 1184 retval = security_task_create(clone_flags);
1185 if (retval) 1185 if (retval)
@@ -1576,15 +1576,6 @@ long do_fork(unsigned long clone_flags,
1576 long nr; 1576 long nr;
1577 1577
1578 /* 1578 /*
1579 * Do some preliminary argument and permissions checking before we
1580 * actually start allocating stuff
1581 */
1582 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1583 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1584 return -EINVAL;
1585 }
1586
1587 /*
1588 * Determine whether and which event to report to ptracer. When 1579 * Determine whether and which event to report to ptracer. When
1589 * called from kernel_thread or CLONE_UNTRACED is explicitly 1580 * called from kernel_thread or CLONE_UNTRACED is explicitly
1590 * requested, no event is reported; otherwise, report if the event 1581 * requested, no event is reported; otherwise, report if the event
@@ -1825,11 +1816,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1825 if (unshare_flags & CLONE_NEWUSER) 1816 if (unshare_flags & CLONE_NEWUSER)
1826 unshare_flags |= CLONE_THREAD | CLONE_FS; 1817 unshare_flags |= CLONE_THREAD | CLONE_FS;
1827 /* 1818 /*
1828 * If unsharing a pid namespace must also unshare the thread.
1829 */
1830 if (unshare_flags & CLONE_NEWPID)
1831 unshare_flags |= CLONE_THREAD;
1832 /*
1833 * If unsharing a thread from a thread group, must also unshare vm. 1819 * If unsharing a thread from a thread group, must also unshare vm.
1834 */ 1820 */
1835 if (unshare_flags & CLONE_THREAD) 1821 if (unshare_flags & CLONE_THREAD)
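The two hunks above move the namespace sanity check out of do_fork()/unshare() and into copy_process(), and tighten it at the same time. A condensed restatement of the new rule, written as a standalone predicate purely for illustration (it is not a helper introduced by the patch), assuming the usual kernel headers for the CLONE_* flags and nsproxy:

/* May these clone flags be combined with the caller's namespace state? */
static bool clone_ns_flags_allowed(unsigned long clone_flags,
				   struct task_struct *forker)
{
	/* Only sharing signal handlers or a parent is restricted. */
	if (!(clone_flags & (CLONE_SIGHAND | CLONE_PARENT)))
		return true;
	/* ...and then no new pid/user namespace may be requested... */
	if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID))
		return false;
	/* ...and the children's pid namespace must still be the caller's. */
	return task_active_pid_ns(forker) ==
	       forker->nsproxy->pid_ns_for_children;
}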
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 9bd0934f6c33..7a7d2ee96d42 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
74{ 74{
75 unsigned long val; 75 unsigned long val;
76 76
77 if (strict_strtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warning("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04ff..90cf1c38c8ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!nsown_capable(CAP_SETGID)) 236 if (!ns_capable(current_user_ns(), CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9d..3e97fb126e6b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h>
18 19
19/* 20/*
20 * The number of tasks checked: 21 * The number of tasks checked:
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
99 * Ok, the task did not get scheduled for more than 2 minutes, 100 * Ok, the task did not get scheduled for more than 2 minutes,
100 * complain: 101 * complain:
101 */ 102 */
102 printk(KERN_ERR "INFO: task %s:%d blocked for more than " 103 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
103 "%ld seconds.\n", t->comm, t->pid, timeout); 104 t->comm, t->pid, timeout);
104 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 105 pr_err(" %s %s %.*s\n",
105 " disables this message.\n"); 106 print_tainted(), init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version);
109 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
110 " disables this message.\n");
106 sched_show_task(t); 111 sched_show_task(t);
107 debug_show_held_locks(t); 112 debug_show_held_locks(t);
108 113
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972a..4a1fef09f658 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,15 +1,4 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem" 1menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code 2# Options selectable by the architecture code
14 3
15# Make sparse irq Kconfig switch below available 4# Make sparse irq Kconfig switch below available
@@ -84,4 +73,3 @@ config SPARSE_IRQ
84 If you don't know what to do here, say N. 73 If you don't know what to do here, say N.
85 74
86endmenu 75endmenu
87endif
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0d..297a9247a3b3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -13,6 +13,7 @@
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/static_key.h>
16#include <linux/jump_label_ratelimit.h>
16 17
17#ifdef HAVE_JUMP_LABEL 18#ifdef HAVE_JUMP_LABEL
18 19
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba745..2a74f307c5ec 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline,
1474 if (first_colon && (!first_space || first_colon < first_space)) 1474 if (first_colon && (!first_space || first_colon < first_space))
1475 return parse_crashkernel_mem(ck_cmdline, system_ram, 1475 return parse_crashkernel_mem(ck_cmdline, system_ram,
1476 crash_size, crash_base); 1476 crash_size, crash_base);
1477 else
1478 return parse_crashkernel_simple(ck_cmdline, crash_size,
1479 crash_base);
1480 1477
1481 return 0; 1478 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1482} 1479}
1483 1480
1484/* 1481/*
diff --git a/kernel/kmod.c b/kernel/kmod.c
index fb326365b694..b086006c59e7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 DECLARE_COMPLETION_ONSTACK(done); 571 DECLARE_COMPLETION_ONSTACK(done);
572 int retval = 0; 572 int retval = 0;
573 573
574 if (!sub_info->path) {
575 call_usermodehelper_freeinfo(sub_info);
576 return -EINVAL;
577 }
574 helper_lock(); 578 helper_lock();
575 if (!khelper_wq || usermodehelper_disabled) { 579 if (!khelper_wq || usermodehelper_disabled) {
576 retval = -EBUSY; 580 retval = -EBUSY;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6e33498d665c..a0d367a49122 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
112struct kprobe_insn_page { 112struct kprobe_insn_page {
113 struct list_head list; 113 struct list_head list;
114 kprobe_opcode_t *insns; /* Page of instruction slots */ 114 kprobe_opcode_t *insns; /* Page of instruction slots */
115 struct kprobe_insn_cache *cache;
115 int nused; 116 int nused;
116 int ngarbage; 117 int ngarbage;
117 char slot_used[]; 118 char slot_used[];
@@ -121,12 +122,6 @@ struct kprobe_insn_page {
121 (offsetof(struct kprobe_insn_page, slot_used) + \ 122 (offsetof(struct kprobe_insn_page, slot_used) + \
122 (sizeof(char) * (slots))) 123 (sizeof(char) * (slots)))
123 124
124struct kprobe_insn_cache {
125 struct list_head pages; /* list of kprobe_insn_page */
126 size_t insn_size; /* size of instruction slot */
127 int nr_garbage;
128};
129
130static int slots_per_page(struct kprobe_insn_cache *c) 125static int slots_per_page(struct kprobe_insn_cache *c)
131{ 126{
132 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); 127 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,8 +133,20 @@ enum kprobe_slot_state {
138 SLOT_USED = 2, 133 SLOT_USED = 2,
139}; 134};
140 135
141static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ 136static void *alloc_insn_page(void)
142static struct kprobe_insn_cache kprobe_insn_slots = { 137{
138 return module_alloc(PAGE_SIZE);
139}
140
141static void free_insn_page(void *page)
142{
143 module_free(NULL, page);
144}
145
146struct kprobe_insn_cache kprobe_insn_slots = {
147 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
148 .alloc = alloc_insn_page,
149 .free = free_insn_page,
143 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 150 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
144 .insn_size = MAX_INSN_SIZE, 151 .insn_size = MAX_INSN_SIZE,
145 .nr_garbage = 0, 152 .nr_garbage = 0,
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
150 * __get_insn_slot() - Find a slot on an executable page for an instruction. 157 * __get_insn_slot() - Find a slot on an executable page for an instruction.
151 * We allocate an executable page if there's no room on existing ones. 158 * We allocate an executable page if there's no room on existing ones.
152 */ 159 */
153static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
154{ 161{
155 struct kprobe_insn_page *kip; 162 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL;
156 164
165 mutex_lock(&c->mutex);
157 retry: 166 retry:
158 list_for_each_entry(kip, &c->pages, list) { 167 list_for_each_entry(kip, &c->pages, list) {
159 if (kip->nused < slots_per_page(c)) { 168 if (kip->nused < slots_per_page(c)) {
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
162 if (kip->slot_used[i] == SLOT_CLEAN) { 171 if (kip->slot_used[i] == SLOT_CLEAN) {
163 kip->slot_used[i] = SLOT_USED; 172 kip->slot_used[i] = SLOT_USED;
164 kip->nused++; 173 kip->nused++;
165 return kip->insns + (i * c->insn_size); 174 slot = kip->insns + (i * c->insn_size);
175 goto out;
166 } 176 }
167 } 177 }
168 /* kip->nused is broken. Fix it. */ 178 /* kip->nused is broken. Fix it. */
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
178 /* All out of space. Need to allocate a new page. */ 188 /* All out of space. Need to allocate a new page. */
179 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); 189 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
180 if (!kip) 190 if (!kip)
181 return NULL; 191 goto out;
182 192
183 /* 193 /*
184 * Use module_alloc so this page is within +/- 2GB of where the 194 * Use module_alloc so this page is within +/- 2GB of where the
185 * kernel image and loaded module images reside. This is required 195 * kernel image and loaded module images reside. This is required
186 * so x86_64 can correctly handle the %rip-relative fixups. 196 * so x86_64 can correctly handle the %rip-relative fixups.
187 */ 197 */
188 kip->insns = module_alloc(PAGE_SIZE); 198 kip->insns = c->alloc();
189 if (!kip->insns) { 199 if (!kip->insns) {
190 kfree(kip); 200 kfree(kip);
191 return NULL; 201 goto out;
192 } 202 }
193 INIT_LIST_HEAD(&kip->list); 203 INIT_LIST_HEAD(&kip->list);
194 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); 204 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
195 kip->slot_used[0] = SLOT_USED; 205 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 206 kip->nused = 1;
197 kip->ngarbage = 0; 207 kip->ngarbage = 0;
208 kip->cache = c;
198 list_add(&kip->list, &c->pages); 209 list_add(&kip->list, &c->pages);
199 return kip->insns; 210 slot = kip->insns;
200} 211out:
201 212 mutex_unlock(&c->mutex);
202 213 return slot;
203kprobe_opcode_t __kprobes *get_insn_slot(void)
204{
205 kprobe_opcode_t *ret = NULL;
206
207 mutex_lock(&kprobe_insn_mutex);
208 ret = __get_insn_slot(&kprobe_insn_slots);
209 mutex_unlock(&kprobe_insn_mutex);
210
211 return ret;
212} 214}
213 215
214/* Return 1 if all garbages are collected, otherwise 0. */ 216/* Return 1 if all garbages are collected, otherwise 0. */
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
225 */ 227 */
226 if (!list_is_singular(&kip->list)) { 228 if (!list_is_singular(&kip->list)) {
227 list_del(&kip->list); 229 list_del(&kip->list);
228 module_free(NULL, kip->insns); 230 kip->cache->free(kip->insns);
229 kfree(kip); 231 kfree(kip);
230 } 232 }
231 return 1; 233 return 1;
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
255 return 0; 257 return 0;
256} 258}
257 259
258static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
259 kprobe_opcode_t *slot, int dirty) 261 kprobe_opcode_t *slot, int dirty)
260{ 262{
261 struct kprobe_insn_page *kip; 263 struct kprobe_insn_page *kip;
262 264
265 mutex_lock(&c->mutex);
263 list_for_each_entry(kip, &c->pages, list) { 266 list_for_each_entry(kip, &c->pages, list) {
264 long idx = ((long)slot - (long)kip->insns) / 267 long idx = ((long)slot - (long)kip->insns) /
265 (c->insn_size * sizeof(kprobe_opcode_t)); 268 (c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
272 collect_garbage_slots(c); 275 collect_garbage_slots(c);
273 } else 276 } else
274 collect_one_slot(kip, idx); 277 collect_one_slot(kip, idx);
275 return; 278 goto out;
276 } 279 }
277 } 280 }
278 /* Could not free this slot. */ 281 /* Could not free this slot. */
279 WARN_ON(1); 282 WARN_ON(1);
283out:
284 mutex_unlock(&c->mutex);
280} 285}
281 286
282void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
283{
284 mutex_lock(&kprobe_insn_mutex);
285 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
286 mutex_unlock(&kprobe_insn_mutex);
287}
288#ifdef CONFIG_OPTPROBES 287#ifdef CONFIG_OPTPROBES
289/* For optimized_kprobe buffer */ 288/* For optimized_kprobe buffer */
290static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ 289struct kprobe_insn_cache kprobe_optinsn_slots = {
291static struct kprobe_insn_cache kprobe_optinsn_slots = { 290 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
291 .alloc = alloc_insn_page,
292 .free = free_insn_page,
292 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 293 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
293 /* .insn_size is initialized later */ 294 /* .insn_size is initialized later */
294 .nr_garbage = 0, 295 .nr_garbage = 0,
295}; 296};
296/* Get a slot for optimized_kprobe buffer */
297kprobe_opcode_t __kprobes *get_optinsn_slot(void)
298{
299 kprobe_opcode_t *ret = NULL;
300
301 mutex_lock(&kprobe_optinsn_mutex);
302 ret = __get_insn_slot(&kprobe_optinsn_slots);
303 mutex_unlock(&kprobe_optinsn_mutex);
304
305 return ret;
306}
307
308void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
309{
310 mutex_lock(&kprobe_optinsn_mutex);
311 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
312 mutex_unlock(&kprobe_optinsn_mutex);
313}
314#endif 297#endif
315#endif 298#endif
316 299
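With __get_insn_slot()/__free_insn_slot() made non-static and the cache gaining per-cache alloc/free callbacks and an embedded mutex, a caller can now declare its own slot cache instead of going through the removed get_insn_slot()/get_optinsn_slot() wrappers. The sketch below mirrors the kprobe_insn_slots/kprobe_optinsn_slots initializers in the hunk; my_insn_slots and the two page helpers are placeholder names, and the struct's full definition is assumed to live in a header not shown in this diff.

static void *my_alloc_page(void)
{
	return module_alloc(PAGE_SIZE);
}

static void my_free_page(void *page)
{
	module_free(NULL, page);
}

static struct kprobe_insn_cache my_insn_slots = {
	.mutex = __MUTEX_INITIALIZER(my_insn_slots.mutex),
	.alloc = my_alloc_page,
	.free = my_free_page,
	.pages = LIST_HEAD_INIT(my_insn_slots.pages),
	.insn_size = MAX_INSN_SIZE,
	.nr_garbage = 0,
};

/* Slots then come from __get_insn_slot(&my_insn_slots) and are returned
 * with __free_insn_slot(&my_insn_slots, slot, dirty); locking is handled
 * by the cache's own mutex. */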
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9a..9659d38e008f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
113 unsigned long cnt; 113 unsigned long cnt;
114 int ret; 114 int ret;
115 115
116 if (strict_strtoul(buf, 0, &cnt)) 116 if (kstrtoul(buf, 0, &cnt))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 ret = crash_shrink_memory(cnt); 119 ret = crash_shrink_memory(cnt);
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
21 arch_spinlock_t *lock; 21 arch_spinlock_t *lock;
22 22
23 preempt_disable(); 23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock); 25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock); 26 arch_spin_lock(lock);
27} 27}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
31{ 31{
32 arch_spinlock_t *lock; 32 arch_spinlock_t *lock;
33 33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock); 35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock); 36 arch_spin_unlock(lock);
37 preempt_enable(); 37 preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
43 arch_spinlock_t *lock; 43 arch_spinlock_t *lock;
44 44
45 preempt_disable(); 45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu); 47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock); 48 arch_spin_lock(lock);
49} 49}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{ 53{
54 arch_spinlock_t *lock; 54 arch_spinlock_t *lock;
55 55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu); 57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock); 58 arch_spin_unlock(lock);
59 preempt_enable(); 59 preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
65 int i; 65 int i;
66 66
67 preempt_disable(); 67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); 68 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
69 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock; 70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i); 71 lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
78{ 78{
79 int i; 79 int i;
80 80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 81 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) { 82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock; 83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i); 84 lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 2b6e69909c39..7cbd4507a7e6 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -18,14 +18,14 @@
18 18
19struct key *modsign_keyring; 19struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initconst const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initconst const u8 modsign_certificate_list_end[];
23 23
24/* 24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes. 26 * if modsign.pub changes.
27 */ 27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo"; 28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29 29
30/* 30/*
31 * Load the compiled-in keys 31 * Load the compiled-in keys
diff --git a/kernel/module.c b/kernel/module.c
index 206915830d29..dc582749fa13 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val,
136} 136}
137 137
138static const struct kernel_param_ops param_ops_bool_enable_only = { 138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .flags = KERNEL_PARAM_FL_NOARG,
139 .set = param_set_bool_enable_only, 140 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 141 .get = param_get_bool,
141}; 142};
@@ -603,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
603static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 604static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
604 struct module_kobject *mk, char *buffer) \ 605 struct module_kobject *mk, char *buffer) \
605{ \ 606{ \
606 return sprintf(buffer, "%s\n", mk->mod->field); \ 607 return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
607} \ 608} \
608static int modinfo_##field##_exists(struct module *mod) \ 609static int modinfo_##field##_exists(struct module *mod) \
609{ \ 610{ \
@@ -1611,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod)
1611 kfree(mod->modinfo_attrs); 1612 kfree(mod->modinfo_attrs);
1612} 1613}
1613 1614
1615static void mod_kobject_put(struct module *mod)
1616{
1617 DECLARE_COMPLETION_ONSTACK(c);
1618 mod->mkobj.kobj_completion = &c;
1619 kobject_put(&mod->mkobj.kobj);
1620 wait_for_completion(&c);
1621}
1622
1614static int mod_sysfs_init(struct module *mod) 1623static int mod_sysfs_init(struct module *mod)
1615{ 1624{
1616 int err; 1625 int err;
@@ -1638,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod)
1638 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, 1647 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1639 "%s", mod->name); 1648 "%s", mod->name);
1640 if (err) 1649 if (err)
1641 kobject_put(&mod->mkobj.kobj); 1650 mod_kobject_put(mod);
1642 1651
1643 /* delay uevent until full sysfs population */ 1652 /* delay uevent until full sysfs population */
1644out: 1653out:
@@ -1682,7 +1691,7 @@ out_unreg_param:
1682out_unreg_holders: 1691out_unreg_holders:
1683 kobject_put(mod->holders_dir); 1692 kobject_put(mod->holders_dir);
1684out_unreg: 1693out_unreg:
1685 kobject_put(&mod->mkobj.kobj); 1694 mod_kobject_put(mod);
1686out: 1695out:
1687 return err; 1696 return err;
1688} 1697}
@@ -1691,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod)
1691{ 1700{
1692 remove_notes_attrs(mod); 1701 remove_notes_attrs(mod);
1693 remove_sect_attrs(mod); 1702 remove_sect_attrs(mod);
1694 kobject_put(&mod->mkobj.kobj); 1703 mod_kobject_put(mod);
1695} 1704}
1696 1705
1697#else /* !CONFIG_SYSFS */ 1706#else /* !CONFIG_SYSFS */
@@ -2540,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2540/* Sets info->hdr and info->len. */ 2549/* Sets info->hdr and info->len. */
2541static int copy_module_from_fd(int fd, struct load_info *info) 2550static int copy_module_from_fd(int fd, struct load_info *info)
2542{ 2551{
2543 struct file *file; 2552 struct fd f = fdget(fd);
2544 int err; 2553 int err;
2545 struct kstat stat; 2554 struct kstat stat;
2546 loff_t pos; 2555 loff_t pos;
2547 ssize_t bytes = 0; 2556 ssize_t bytes = 0;
2548 2557
2549 file = fget(fd); 2558 if (!f.file)
2550 if (!file)
2551 return -ENOEXEC; 2559 return -ENOEXEC;
2552 2560
2553 err = security_kernel_module_from_file(file); 2561 err = security_kernel_module_from_file(f.file);
2554 if (err) 2562 if (err)
2555 goto out; 2563 goto out;
2556 2564
2557 err = vfs_getattr(&file->f_path, &stat); 2565 err = vfs_getattr(&f.file->f_path, &stat);
2558 if (err) 2566 if (err)
2559 goto out; 2567 goto out;
2560 2568
@@ -2577,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2577 2585
2578 pos = 0; 2586 pos = 0;
2579 while (pos < stat.size) { 2587 while (pos < stat.size) {
2580 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, 2588 bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
2581 stat.size - pos); 2589 stat.size - pos);
2582 if (bytes < 0) { 2590 if (bytes < 0) {
2583 vfree(info->hdr); 2591 vfree(info->hdr);
@@ -2591,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2591 info->len = pos; 2599 info->len = pos;
2592 2600
2593out: 2601out:
2594 fput(file); 2602 fdput(f);
2595 return err; 2603 return err;
2596} 2604}
2597 2605
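The copy_module_from_fd() hunk converts the open-coded fget()/fput() pair to the struct fd helpers, which can skip the reference-count bump when the file table is not shared. A minimal usage sketch of that pattern, assuming kernel context (linux/file.h, linux/fs.h); with_fd() and do_something_with() are placeholder names, not functions from the patch:

static int do_something_with(struct file *filp)
{
	return 0;	/* stands in for real work on the file */
}

static int with_fd(int fd)
{
	struct fd f = fdget(fd);
	int err;

	if (!f.file)
		return -EBADF;	/* the hunk above uses -ENOEXEC for modules */

	err = do_something_with(f.file);
	fdput(f);		/* undoes exactly what fdget() did */
	return err;
}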
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a52ee7bb830d..d24105b1b794 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
209 */ 209 */
210static inline int mutex_can_spin_on_owner(struct mutex *lock) 210static inline int mutex_can_spin_on_owner(struct mutex *lock)
211{ 211{
212 struct task_struct *owner;
212 int retval = 1; 213 int retval = 1;
213 214
214 rcu_read_lock(); 215 rcu_read_lock();
215 if (lock->owner) 216 owner = ACCESS_ONCE(lock->owner);
216 retval = lock->owner->on_cpu; 217 if (owner)
218 retval = owner->on_cpu;
217 rcu_read_unlock(); 219 rcu_read_unlock();
218 /* 220 /*
219 * if lock->owner is not set, the mutex owner may have just acquired 221 * if lock->owner is not set, the mutex owner may have just acquired
@@ -408,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
408static __always_inline int __sched 410static __always_inline int __sched
409__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
410 struct lockdep_map *nest_lock, unsigned long ip, 412 struct lockdep_map *nest_lock, unsigned long ip,
411 struct ww_acquire_ctx *ww_ctx) 413 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
412{ 414{
413 struct task_struct *task = current; 415 struct task_struct *task = current;
414 struct mutex_waiter waiter; 416 struct mutex_waiter waiter;
@@ -448,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
448 struct task_struct *owner; 450 struct task_struct *owner;
449 struct mspin_node node; 451 struct mspin_node node;
450 452
451 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 453 if (use_ww_ctx && ww_ctx->acquired > 0) {
452 struct ww_mutex *ww; 454 struct ww_mutex *ww;
453 455
454 ww = container_of(lock, struct ww_mutex, base); 456 ww = container_of(lock, struct ww_mutex, base);
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
461 * performed the optimistic spinning cannot be done. 463 * performed the optimistic spinning cannot be done.
462 */ 464 */
463 if (ACCESS_ONCE(ww->ctx)) 465 if (ACCESS_ONCE(ww->ctx))
464 break; 466 goto slowpath;
465 } 467 }
466 468
467 /* 469 /*
@@ -472,13 +474,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
472 owner = ACCESS_ONCE(lock->owner); 474 owner = ACCESS_ONCE(lock->owner);
473 if (owner && !mutex_spin_on_owner(lock, owner)) { 475 if (owner && !mutex_spin_on_owner(lock, owner)) {
474 mspin_unlock(MLOCK(lock), &node); 476 mspin_unlock(MLOCK(lock), &node);
475 break; 477 goto slowpath;
476 } 478 }
477 479
478 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
479 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
480 lock_acquired(&lock->dep_map, ip); 482 lock_acquired(&lock->dep_map, ip);
481 if (!__builtin_constant_p(ww_ctx == NULL)) { 483 if (use_ww_ctx) {
482 struct ww_mutex *ww; 484 struct ww_mutex *ww;
483 ww = container_of(lock, struct ww_mutex, base); 485 ww = container_of(lock, struct ww_mutex, base);
484 486
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
499 * the owner complete. 501 * the owner complete.
500 */ 502 */
501 if (!owner && (need_resched() || rt_task(task))) 503 if (!owner && (need_resched() || rt_task(task)))
502 break; 504 goto slowpath;
503 505
504 /* 506 /*
505 * The cpu_relax() call is a compiler barrier which forces 507 * The cpu_relax() call is a compiler barrier which forces
@@ -513,6 +515,10 @@ slowpath:
513#endif 515#endif
514 spin_lock_mutex(&lock->wait_lock, flags); 516 spin_lock_mutex(&lock->wait_lock, flags);
515 517
518 /* once more, can we acquire the lock? */
519 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
520 goto skip_wait;
521
516 debug_mutex_lock_common(lock, &waiter); 522 debug_mutex_lock_common(lock, &waiter);
517 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 523 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
518 524
@@ -520,9 +526,6 @@ slowpath:
520 list_add_tail(&waiter.list, &lock->wait_list); 526 list_add_tail(&waiter.list, &lock->wait_list);
521 waiter.task = task; 527 waiter.task = task;
522 528
523 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
524 goto done;
525
526 lock_contended(&lock->dep_map, ip); 529 lock_contended(&lock->dep_map, ip);
527 530
528 for (;;) { 531 for (;;) {
@@ -536,7 +539,7 @@ slowpath:
536 * other waiters: 539 * other waiters:
537 */ 540 */
538 if (MUTEX_SHOW_NO_WAITER(lock) && 541 if (MUTEX_SHOW_NO_WAITER(lock) &&
539 (atomic_xchg(&lock->count, -1) == 1)) 542 (atomic_xchg(&lock->count, -1) == 1))
540 break; 543 break;
541 544
542 /* 545 /*
@@ -548,7 +551,7 @@ slowpath:
548 goto err; 551 goto err;
549 } 552 }
550 553
551 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 554 if (use_ww_ctx && ww_ctx->acquired > 0) {
552 ret = __mutex_lock_check_stamp(lock, ww_ctx); 555 ret = __mutex_lock_check_stamp(lock, ww_ctx);
553 if (ret) 556 if (ret)
554 goto err; 557 goto err;
@@ -561,24 +564,25 @@ slowpath:
561 schedule_preempt_disabled(); 564 schedule_preempt_disabled();
562 spin_lock_mutex(&lock->wait_lock, flags); 565 spin_lock_mutex(&lock->wait_lock, flags);
563 } 566 }
567 mutex_remove_waiter(lock, &waiter, current_thread_info());
568 /* set it to 0 if there are no waiters left: */
569 if (likely(list_empty(&lock->wait_list)))
570 atomic_set(&lock->count, 0);
571 debug_mutex_free_waiter(&waiter);
564 572
565done: 573skip_wait:
574 /* got the lock - cleanup and rejoice! */
566 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
567 /* got the lock - rejoice! */
568 mutex_remove_waiter(lock, &waiter, current_thread_info());
569 mutex_set_owner(lock); 576 mutex_set_owner(lock);
570 577
571 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (use_ww_ctx) {
572 struct ww_mutex *ww = container_of(lock, 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
576 581
577 /* 582 /*
578 * This branch gets optimized out for the common case, 583 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock. 584 * and is only important for ww_mutex_lock.
580 */ 585 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx); 586 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx; 587 ww->ctx = ww_ctx;
584 588
@@ -592,15 +596,8 @@ done:
592 } 596 }
593 } 597 }
594 598
595 /* set it to 0 if there are no waiters left: */
596 if (likely(list_empty(&lock->wait_list)))
597 atomic_set(&lock->count, 0);
598
599 spin_unlock_mutex(&lock->wait_lock, flags); 599 spin_unlock_mutex(&lock->wait_lock, flags);
600
601 debug_mutex_free_waiter(&waiter);
602 preempt_enable(); 600 preempt_enable();
603
604 return 0; 601 return 0;
605 602
606err: 603err:
@@ -618,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
618{ 615{
619 might_sleep(); 616 might_sleep();
620 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
621 subclass, NULL, _RET_IP_, NULL); 618 subclass, NULL, _RET_IP_, NULL, 0);
622} 619}
623 620
624EXPORT_SYMBOL_GPL(mutex_lock_nested); 621EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -628,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
628{ 625{
629 might_sleep(); 626 might_sleep();
630 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
631 0, nest, _RET_IP_, NULL); 628 0, nest, _RET_IP_, NULL, 0);
632} 629}
633 630
634EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -638,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
638{ 635{
639 might_sleep(); 636 might_sleep();
640 return __mutex_lock_common(lock, TASK_KILLABLE, 637 return __mutex_lock_common(lock, TASK_KILLABLE,
641 subclass, NULL, _RET_IP_, NULL); 638 subclass, NULL, _RET_IP_, NULL, 0);
642} 639}
643EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
644 641
@@ -647,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
647{ 644{
648 might_sleep(); 645 might_sleep();
649 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
650 subclass, NULL, _RET_IP_, NULL); 647 subclass, NULL, _RET_IP_, NULL, 0);
651} 648}
652 649
653EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -685,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
685 682
686 might_sleep(); 683 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx, 1);
689 if (!ret && ctx->acquired > 1) 686 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
691 688
@@ -700,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
700 697
701 might_sleep(); 698 might_sleep();
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx, 1);
704 701
705 if (!ret && ctx->acquired > 1) 702 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
@@ -812,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
812 struct mutex *lock = container_of(lock_count, struct mutex, count); 809 struct mutex *lock = container_of(lock_count, struct mutex, count);
813 810
814 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, 811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
815 NULL, _RET_IP_, NULL); 812 NULL, _RET_IP_, NULL, 0);
816} 813}
817 814
818static noinline int __sched 815static noinline int __sched
819__mutex_lock_killable_slowpath(struct mutex *lock) 816__mutex_lock_killable_slowpath(struct mutex *lock)
820{ 817{
821 return __mutex_lock_common(lock, TASK_KILLABLE, 0, 818 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
822 NULL, _RET_IP_, NULL); 819 NULL, _RET_IP_, NULL, 0);
823} 820}
824 821
825static noinline int __sched 822static noinline int __sched
826__mutex_lock_interruptible_slowpath(struct mutex *lock) 823__mutex_lock_interruptible_slowpath(struct mutex *lock)
827{ 824{
828 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, 825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
829 NULL, _RET_IP_, NULL); 826 NULL, _RET_IP_, NULL, 0);
830} 827}
831 828
832static noinline int __sched 829static noinline int __sched
833__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
834{ 831{
835 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, 832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
836 NULL, _RET_IP_, ctx); 833 NULL, _RET_IP_, ctx, 1);
837} 834}
838 835
839static noinline int __sched 836static noinline int __sched
@@ -841,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
841 struct ww_acquire_ctx *ctx) 838 struct ww_acquire_ctx *ctx)
842{ 839{
843 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, 840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
844 NULL, _RET_IP_, ctx); 841 NULL, _RET_IP_, ctx, 1);
845} 842}
846 843
847#endif 844#endif
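
Note on the mutex.c hunk above: the `__builtin_constant_p(ww_ctx == NULL)` trick is replaced by an explicit `use_ww_ctx` argument that every caller passes as a literal 0 or 1, so the compiler can still fold the ww_mutex-only branches away while the intent stays readable. A minimal userspace sketch of that pattern follows; the names (lock_common, struct ctx) are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct ctx { int acquired; };

/* Common slowpath: 'use_ctx' is a compile-time constant at both call
 * sites below, so the dead branch is optimized out per caller. */
static inline int lock_common(struct ctx *ctx, bool use_ctx)
{
        if (use_ctx && ctx->acquired > 0)
                return -1;      /* stands in for the ww_mutex stamp check */
        return 0;               /* plain mutex path */
}

static int lock_plain(void)
{
        return lock_common(NULL, false);        /* ww branch folded away */
}

static int lock_with_ctx(struct ctx *ctx)
{
        return lock_common(ctx, true);
}

int main(void)
{
        struct ctx c = { .acquired = 1 };

        printf("%d %d\n", lock_plain(), lock_with_ctx(&c));
        return 0;
}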
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 997cbb951a3b..8e7811086b82 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -126,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
126 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
128 struct nsproxy *new_ns; 128 struct nsproxy *new_ns;
129 int err = 0;
130 129
131 if (!old_ns) 130 if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
131 CLONE_NEWPID | CLONE_NEWNET)))) {
132 get_nsproxy(old_ns);
132 return 0; 133 return 0;
133
134 get_nsproxy(old_ns);
135
136 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
137 CLONE_NEWPID | CLONE_NEWNET)))
138 return 0;
139
140 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
141 err = -EPERM;
142 goto out;
143 } 134 }
144 135
136 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
137 return -EPERM;
138
145 /* 139 /*
146 * CLONE_NEWIPC must detach from the undolist: after switching 140 * CLONE_NEWIPC must detach from the undolist: after switching
147 * to a new ipc namespace, the semaphore arrays from the old 141 * to a new ipc namespace, the semaphore arrays from the old
@@ -149,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
149 * means share undolist with parent, so we must forbid using 143 * means share undolist with parent, so we must forbid using
150 * it along with CLONE_NEWIPC. 144 * it along with CLONE_NEWIPC.
151 */ 145 */
152 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { 146 if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
153 err = -EINVAL; 147 (CLONE_NEWIPC | CLONE_SYSVSEM))
154 goto out; 148 return -EINVAL;
155 }
156 149
157 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); 150 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
158 if (IS_ERR(new_ns)) { 151 if (IS_ERR(new_ns))
159 err = PTR_ERR(new_ns); 152 return PTR_ERR(new_ns);
160 goto out;
161 }
162 153
163 tsk->nsproxy = new_ns; 154 tsk->nsproxy = new_ns;
164 155 return 0;
165out:
166 put_nsproxy(old_ns);
167 return err;
168} 156}
169 157
170void free_nsproxy(struct nsproxy *ns) 158void free_nsproxy(struct nsproxy *ns)
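
Note on the nsproxy.c hunk above: copy_namespaces() now handles the common "no new namespaces requested" case first and returns early, which removes the out:/err bookkeeping entirely. A hedged userspace sketch of the same flag-mask early-return shape; the NEW_* constants are placeholders, not the real CLONE_* flags.

#include <errno.h>
#include <stdio.h>

#define NEW_NS   0x1
#define NEW_UTS  0x2
#define NEW_IPC  0x4
#define SYSVSEM  0x8

static int copy_stuff(unsigned long flags, int privileged)
{
        /* Fast path: nothing requested, just share the existing state. */
        if (!(flags & (NEW_NS | NEW_UTS | NEW_IPC)))
                return 0;

        if (!privileged)
                return -EPERM;

        /* Mutually exclusive request: both bits set is invalid. */
        if ((flags & (NEW_IPC | SYSVSEM)) == (NEW_IPC | SYSVSEM))
                return -EINVAL;

        /* ... allocate and install the new state here ... */
        return 0;
}

int main(void)
{
        printf("%d\n", copy_stuff(NEW_IPC | SYSVSEM, 1));      /* -EINVAL */
        return 0;
}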
diff --git a/kernel/padata.c b/kernel/padata.c
index 072f4ee4eb89..07af2c95dcfe 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
846 switch (action) { 846 switch (action) {
847 case CPU_ONLINE: 847 case CPU_ONLINE:
848 case CPU_ONLINE_FROZEN: 848 case CPU_ONLINE_FROZEN:
849 case CPU_DOWN_FAILED:
850 case CPU_DOWN_FAILED_FROZEN:
849 if (!pinst_has_cpu(pinst, cpu)) 851 if (!pinst_has_cpu(pinst, cpu))
850 break; 852 break;
851 mutex_lock(&pinst->lock); 853 mutex_lock(&pinst->lock);
@@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
857 859
858 case CPU_DOWN_PREPARE: 860 case CPU_DOWN_PREPARE:
859 case CPU_DOWN_PREPARE_FROZEN: 861 case CPU_DOWN_PREPARE_FROZEN:
862 case CPU_UP_CANCELED:
863 case CPU_UP_CANCELED_FROZEN:
860 if (!pinst_has_cpu(pinst, cpu)) 864 if (!pinst_has_cpu(pinst, cpu))
861 break; 865 break;
862 mutex_lock(&pinst->lock); 866 mutex_lock(&pinst->lock);
@@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb,
865 if (err) 869 if (err)
866 return notifier_from_errno(err); 870 return notifier_from_errno(err);
867 break; 871 break;
868
869 case CPU_UP_CANCELED:
870 case CPU_UP_CANCELED_FROZEN:
871 if (!pinst_has_cpu(pinst, cpu))
872 break;
873 mutex_lock(&pinst->lock);
874 __padata_remove_cpu(pinst, cpu);
875 mutex_unlock(&pinst->lock);
876
877 case CPU_DOWN_FAILED:
878 case CPU_DOWN_FAILED_FROZEN:
879 if (!pinst_has_cpu(pinst, cpu))
880 break;
881 mutex_lock(&pinst->lock);
882 __padata_add_cpu(pinst, cpu);
883 mutex_unlock(&pinst->lock);
884 } 872 }
885 873
886 return NOTIFY_OK; 874 return NOTIFY_OK;
@@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1086 1074
1087 pinst->flags = 0; 1075 pinst->flags = 0;
1088 1076
1089#ifdef CONFIG_HOTPLUG_CPU
1090 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1091 pinst->cpu_notifier.priority = 0;
1092 register_hotcpu_notifier(&pinst->cpu_notifier);
1093#endif
1094
1095 put_online_cpus(); 1077 put_online_cpus();
1096 1078
1097 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); 1079 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1098 kobject_init(&pinst->kobj, &padata_attr_type); 1080 kobject_init(&pinst->kobj, &padata_attr_type);
1099 mutex_init(&pinst->lock); 1081 mutex_init(&pinst->lock);
1100 1082
1083#ifdef CONFIG_HOTPLUG_CPU
1084 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1085 pinst->cpu_notifier.priority = 0;
1086 register_hotcpu_notifier(&pinst->cpu_notifier);
1087#endif
1088
1101 return pinst; 1089 return pinst;
1102 1090
1103err_free_masks: 1091err_free_masks:
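
Note on the padata.c hunk above: CPU_DOWN_FAILED joins the CPU_ONLINE case and CPU_UP_CANCELED joins CPU_DOWN_PREPARE, which also removes an accidental fall-through (the old CPU_UP_CANCELED block had no break before CPU_DOWN_FAILED). A small sketch of grouping switch cases that share a handler, with made-up event names rather than the real hotplug constants:

#include <stdio.h>

enum cpu_event { ONLINE, DOWN_FAILED, DOWN_PREPARE, UP_CANCELED };

static void handle(enum cpu_event ev)
{
        switch (ev) {
        case ONLINE:
        case DOWN_FAILED:       /* CPU ends up online either way */
                puts("add cpu to the active masks");
                break;

        case DOWN_PREPARE:
        case UP_CANCELED:       /* CPU ends up offline either way */
                puts("remove cpu from the active masks");
                break;
        }
}

int main(void)
{
        handle(DOWN_FAILED);
        handle(UP_CANCELED);
        return 0;
}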
diff --git a/kernel/panic.c b/kernel/panic.c
index 801864600514..b6c482ccc5db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...)
123 */ 123 */
124 smp_send_stop(); 124 smp_send_stop();
125 125
126 kmsg_dump(KMSG_DUMP_PANIC); 126 /*
127 127 * Run any panic handlers, including those that might need to
128 * add information to the kmsg dump output.
129 */
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 130 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 131
132 kmsg_dump(KMSG_DUMP_PANIC);
133
130 bust_spinlocks(0); 134 bust_spinlocks(0);
131 135
132 if (!panic_blink) 136 if (!panic_blink)
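
Note on the panic.c hunk above: the change only reorders two calls so that the panic notifier chain runs before kmsg_dump(), letting handlers append information that the dump then captures. A rough userspace analogue of "run callbacks, then snapshot the log"; this is not the kernel notifier API, just the ordering idea.

#include <stdio.h>

#define MAX_HANDLERS 4

static void (*handlers[MAX_HANDLERS])(void);
static int nr_handlers;

static void register_handler(void (*fn)(void))
{
        if (nr_handlers < MAX_HANDLERS)
                handlers[nr_handlers++] = fn;
}

static void add_context(void)
{
        puts("handler: appended extra state to the log");
}

static void do_panic(void)
{
        /* Handlers first, so whatever they log ends up in the dump. */
        for (int i = 0; i < nr_handlers; i++)
                handlers[i]();

        puts("dump: snapshot of everything logged so far");
}

int main(void)
{
        register_handler(add_context);
        do_panic();
        return 0;
}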
diff --git a/kernel/params.c b/kernel/params.c
index 440e65d1a544..c00d5b502aa4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -103,8 +103,8 @@ static int parse_one(char *param,
103 || params[i].level > max_level) 103 || params[i].level > max_level)
104 return 0; 104 return 0;
105 /* No one handled NULL, so do it here. */ 105 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool 106 if (!val &&
107 && params[i].ops->set != param_set_bint) 107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
108 return -EINVAL; 108 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 109 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 110 params[i].ops->set);
@@ -241,7 +241,8 @@ int parse_args(const char *doing,
241 } \ 241 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 242 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 243 { \
244 return sprintf(buffer, format, *((type *)kp->arg)); \ 244 return scnprintf(buffer, PAGE_SIZE, format, \
245 *((type *)kp->arg)); \
245 } \ 246 } \
246 struct kernel_param_ops param_ops_##name = { \ 247 struct kernel_param_ops param_ops_##name = { \
247 .set = param_set_##name, \ 248 .set = param_set_##name, \
@@ -252,13 +253,13 @@ int parse_args(const char *doing,
252 EXPORT_SYMBOL(param_ops_##name) 253 EXPORT_SYMBOL(param_ops_##name)
253 254
254 255
255STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
256STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
257STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); 258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
258STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); 259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
259STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); 260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
260STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
261STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
262 263
263int param_set_charp(const char *val, const struct kernel_param *kp) 264int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 265{
@@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp);
285 286
286int param_get_charp(char *buffer, const struct kernel_param *kp) 287int param_get_charp(char *buffer, const struct kernel_param *kp)
287{ 288{
288 return sprintf(buffer, "%s", *((char **)kp->arg)); 289 return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
289} 290}
290EXPORT_SYMBOL(param_get_charp); 291EXPORT_SYMBOL(param_get_charp);
291 292
@@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
320EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
321 322
322struct kernel_param_ops param_ops_bool = { 323struct kernel_param_ops param_ops_bool = {
324 .flags = KERNEL_PARAM_FL_NOARG,
323 .set = param_set_bool, 325 .set = param_set_bool,
324 .get = param_get_bool, 326 .get = param_get_bool,
325}; 327};
@@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
370EXPORT_SYMBOL(param_set_bint); 372EXPORT_SYMBOL(param_set_bint);
371 373
372struct kernel_param_ops param_ops_bint = { 374struct kernel_param_ops param_ops_bint = {
375 .flags = KERNEL_PARAM_FL_NOARG,
373 .set = param_set_bint, 376 .set = param_set_bint,
374 .get = param_get_int, 377 .get = param_get_int,
375}; 378};
@@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
827 struct module_version_attribute *vattr = 830 struct module_version_attribute *vattr =
828 container_of(mattr, struct module_version_attribute, mattr); 831 container_of(mattr, struct module_version_attribute, mattr);
829 832
830 return sprintf(buf, "%s\n", vattr->version); 833 return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
831} 834}
832 835
833extern const struct module_version_attribute *__start___modver[]; 836extern const struct module_version_attribute *__start___modver[];
@@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = {
912struct kset *module_kset; 915struct kset *module_kset;
913int module_sysfs_initialized; 916int module_sysfs_initialized;
914 917
918static void module_kobj_release(struct kobject *kobj)
919{
920 struct module_kobject *mk = to_module_kobject(kobj);
921 complete(mk->kobj_completion);
922}
923
915struct kobj_type module_ktype = { 924struct kobj_type module_ktype = {
925 .release = module_kobj_release,
916 .sysfs_ops = &module_sysfs_ops, 926 .sysfs_ops = &module_sysfs_ops,
917}; 927};
918 928
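
Note on the params.c hunk above: the sysfs getters move from sprintf() to scnprintf(..., PAGE_SIZE, ...), the numeric parsers move from strict_strtoul/strict_strtol to kstrtoul/kstrtol, and the new KERNEL_PARAM_FL_NOARG flag marks argument-less parameters on the ops instead of comparing set() pointers. The point of scnprintf is that it is bounded and returns what was actually written; here is a small userspace stand-in built on vsnprintf with the same return convention (my_scnprintf is an illustrative name, not a libc function).

#include <stdarg.h>
#include <stdio.h>

/* Like the kernel's scnprintf(): never writes past 'size' and returns the
 * number of characters actually placed in 'buf' (excluding the NUL). */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int ret;

        if (size == 0)
                return 0;

        va_start(args, fmt);
        ret = vsnprintf(buf, size, fmt, args);
        va_end(args);

        return ret >= (int)size ? (int)size - 1 : ret;
}

int main(void)
{
        char buf[8];
        int n = my_scnprintf(buf, sizeof(buf), "%s", "a long parameter value");

        printf("wrote %d chars: \"%s\"\n", n, buf);     /* 7 chars, truncated */
        return 0;
}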
diff --git a/kernel/pid.c b/kernel/pid.c
index 66505c1dfc51..9b9a26698144 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,6 +265,7 @@ void free_pid(struct pid *pid)
265 struct pid_namespace *ns = upid->ns; 265 struct pid_namespace *ns = upid->ns;
266 hlist_del_rcu(&upid->pid_chain); 266 hlist_del_rcu(&upid->pid_chain);
267 switch(--ns->nr_hashed) { 267 switch(--ns->nr_hashed) {
268 case 2:
268 case 1: 269 case 1:
269 /* When all that is left in the pid namespace 270 /* When all that is left in the pid namespace
270 * is the reaper wake up the reaper. The reaper 271 * is the reaper wake up the reaper. The reaper
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
272 */ 273 */
273 wake_up_process(ns->child_reaper); 274 wake_up_process(ns->child_reaper);
274 break; 275 break;
276 case PIDNS_HASH_ADDING:
277 /* Handle a fork failure of the first process */
278 WARN_ON(ns->child_reaper);
279 ns->nr_hashed = 0;
280 /* fall through */
275 case 0: 281 case 0:
276 schedule_work(&ns->proc_work); 282 schedule_work(&ns->proc_work);
277 break; 283 break;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 601bb361c235..42086551a24a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
329 struct pid_namespace *ancestor, *new = ns; 329 struct pid_namespace *ancestor, *new = ns;
330 330
331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
332 !nsown_capable(CAP_SYS_ADMIN)) 332 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
333 return -EPERM; 333 return -EPERM;
334 334
335 /* 335 /*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773e..0121dab83f43 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -39,7 +39,7 @@ static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 42__visible int in_suspend __nosavedata;
43 43
44enum { 44enum {
45 HIBERNATION_INVALID, 45 HIBERNATION_INVALID,
@@ -644,22 +644,23 @@ int hibernate(void)
644 if (error) 644 if (error)
645 goto Exit; 645 goto Exit;
646 646
647 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps();
649 if (error)
650 goto Exit;
651
652 printk(KERN_INFO "PM: Syncing filesystems ... "); 647 printk(KERN_INFO "PM: Syncing filesystems ... ");
653 sys_sync(); 648 sys_sync();
654 printk("done.\n"); 649 printk("done.\n");
655 650
656 error = freeze_processes(); 651 error = freeze_processes();
657 if (error) 652 if (error)
658 goto Free_bitmaps; 653 goto Exit;
654
655 lock_device_hotplug();
656 /* Allocate memory management structures */
657 error = create_basic_memory_bitmaps();
658 if (error)
659 goto Thaw;
659 660
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 662 if (error || freezer_test_done)
662 goto Thaw; 663 goto Free_bitmaps;
663 664
664 if (in_suspend) { 665 if (in_suspend) {
665 unsigned int flags = 0; 666 unsigned int flags = 0;
@@ -682,14 +683,14 @@ int hibernate(void)
682 pr_debug("PM: Image restored successfully.\n"); 683 pr_debug("PM: Image restored successfully.\n");
683 } 684 }
684 685
686 Free_bitmaps:
687 free_basic_memory_bitmaps();
685 Thaw: 688 Thaw:
689 unlock_device_hotplug();
686 thaw_processes(); 690 thaw_processes();
687 691
688 /* Don't bother checking whether freezer_test_done is true */ 692 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false; 693 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps();
693 Exit: 694 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 695 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 696 pm_restore_console();
@@ -806,21 +807,20 @@ static int software_resume(void)
806 pm_prepare_console(); 807 pm_prepare_console();
807 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 808 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
808 if (error) 809 if (error)
809 goto close_finish; 810 goto Close_Finish;
810
811 error = create_basic_memory_bitmaps();
812 if (error)
813 goto close_finish;
814 811
815 pr_debug("PM: Preparing processes for restore.\n"); 812 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 813 error = freeze_processes();
817 if (error) { 814 if (error)
818 swsusp_close(FMODE_READ); 815 goto Close_Finish;
819 goto Done;
820 }
821 816
822 pr_debug("PM: Loading hibernation image.\n"); 817 pr_debug("PM: Loading hibernation image.\n");
823 818
819 lock_device_hotplug();
820 error = create_basic_memory_bitmaps();
821 if (error)
822 goto Thaw;
823
824 error = swsusp_read(&flags); 824 error = swsusp_read(&flags);
825 swsusp_close(FMODE_READ); 825 swsusp_close(FMODE_READ);
826 if (!error) 826 if (!error)
@@ -828,9 +828,10 @@ static int software_resume(void)
828 828
829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); 829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
830 swsusp_free(); 830 swsusp_free();
831 thaw_processes();
832 Done:
833 free_basic_memory_bitmaps(); 831 free_basic_memory_bitmaps();
832 Thaw:
833 unlock_device_hotplug();
834 thaw_processes();
834 Finish: 835 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 836 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 837 pm_restore_console();
@@ -840,12 +841,12 @@ static int software_resume(void)
840 mutex_unlock(&pm_mutex); 841 mutex_unlock(&pm_mutex);
841 pr_debug("PM: Hibernation image not present or could not be loaded.\n"); 842 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
842 return error; 843 return error;
843close_finish: 844 Close_Finish:
844 swsusp_close(FMODE_READ); 845 swsusp_close(FMODE_READ);
845 goto Finish; 846 goto Finish;
846} 847}
847 848
848late_initcall(software_resume); 849late_initcall_sync(software_resume);
849 850
850 851
851static const char * const hibernation_modes[] = { 852static const char * const hibernation_modes[] = {
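
Note on the hibernate.c hunk above: hibernate() and software_resume() now allocate the memory bitmaps after freezing processes and under lock_device_hotplug(), so the goto labels were reshuffled to unwind in the reverse order of setup. The shape is the usual acquire-in-order / release-in-reverse goto ladder; a compact userspace illustration with malloc standing in for the real setup steps:

#include <stdio.h>
#include <stdlib.h>

static int do_everything(void)
{
        int err = -1;
        char *a, *b;

        a = malloc(16);                 /* step 1: e.g. freeze processes */
        if (!a)
                goto out;

        b = malloc(16);                 /* step 2: e.g. create bitmaps */
        if (!b)
                goto free_a;

        puts("main work");              /* snapshot / restore */
        err = 0;

        free(b);                        /* unwind in reverse order */
free_a:
        free(a);
out:
        return err;
}

int main(void)
{
        return do_everything();
}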
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 349587bb03e1..98c3b34a4cff 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
352 struct mem_extent *ext, *cur, *aux; 352 struct mem_extent *ext, *cur, *aux;
353 353
354 zone_start = zone->zone_start_pfn; 354 zone_start = zone->zone_start_pfn;
355 zone_end = zone->zone_start_pfn + zone->spanned_pages; 355 zone_end = zone_end_pfn(zone);
356 356
357 list_for_each_entry(ext, list, hook) 357 list_for_each_entry(ext, list, hook)
358 if (zone_start <= ext->end) 358 if (zone_start <= ext->end)
@@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void)
743 struct memory_bitmap *bm1, *bm2; 743 struct memory_bitmap *bm1, *bm2;
744 int error = 0; 744 int error = 0;
745 745
746 BUG_ON(forbidden_pages_map || free_pages_map); 746 if (forbidden_pages_map && free_pages_map)
747 return 0;
748 else
749 BUG_ON(forbidden_pages_map || free_pages_map);
747 750
748 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); 751 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
749 if (!bm1) 752 if (!bm1)
@@ -884,7 +887,7 @@ static unsigned int count_highmem_pages(void)
884 continue; 887 continue;
885 888
886 mark_free_pages(zone); 889 mark_free_pages(zone);
887 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 890 max_zone_pfn = zone_end_pfn(zone);
888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 891 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
889 if (saveable_highmem_page(zone, pfn)) 892 if (saveable_highmem_page(zone, pfn))
890 n++; 893 n++;
@@ -948,7 +951,7 @@ static unsigned int count_data_pages(void)
948 continue; 951 continue;
949 952
950 mark_free_pages(zone); 953 mark_free_pages(zone);
951 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 954 max_zone_pfn = zone_end_pfn(zone);
952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 955 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
953 if (saveable_page(zone, pfn)) 956 if (saveable_page(zone, pfn))
954 n++; 957 n++;
@@ -1041,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1041 unsigned long max_zone_pfn; 1044 unsigned long max_zone_pfn;
1042 1045
1043 mark_free_pages(zone); 1046 mark_free_pages(zone);
1044 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1047 max_zone_pfn = zone_end_pfn(zone);
1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1048 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1046 if (page_is_saveable(zone, pfn)) 1049 if (page_is_saveable(zone, pfn))
1047 memory_bm_set_bit(orig_bm, pfn); 1050 memory_bm_set_bit(orig_bm, pfn);
@@ -1093,7 +1096,7 @@ void swsusp_free(void)
1093 unsigned long pfn, max_zone_pfn; 1096 unsigned long pfn, max_zone_pfn;
1094 1097
1095 for_each_populated_zone(zone) { 1098 for_each_populated_zone(zone) {
1096 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1099 max_zone_pfn = zone_end_pfn(zone);
1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1100 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1098 if (pfn_valid(pfn)) { 1101 if (pfn_valid(pfn)) {
1099 struct page *page = pfn_to_page(pfn); 1102 struct page *page = pfn_to_page(pfn);
@@ -1755,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1755 1758
1756 /* Clear page flags */ 1759 /* Clear page flags */
1757 for_each_populated_zone(zone) { 1760 for_each_populated_zone(zone) {
1758 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1761 max_zone_pfn = zone_end_pfn(zone);
1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1762 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1760 if (pfn_valid(pfn)) 1763 if (pfn_valid(pfn))
1761 swsusp_unset_page_free(pfn_to_page(pfn)); 1764 swsusp_unset_page_free(pfn_to_page(pfn));
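
Note on the snapshot.c hunk above: every open-coded `zone->zone_start_pfn + zone->spanned_pages` becomes zone_end_pfn(zone), a helper the kernel already provides, so the change is purely about using the named accessor. A sketch of the idea with a hypothetical struct and helper name:

#include <stdio.h>

struct zone {
        unsigned long start_pfn;
        unsigned long spanned_pages;
};

/* Named accessor instead of open-coding start + spanned at every site. */
static inline unsigned long zone_end(const struct zone *z)
{
        return z->start_pfn + z->spanned_pages;
}

int main(void)
{
        struct zone z = { .start_pfn = 0x1000, .spanned_pages = 256 };

        for (unsigned long pfn = z.start_pfn; pfn < zone_end(&z); pfn++)
                ;       /* visit each page frame in the zone */

        printf("zone ends at pfn %#lx\n", zone_end(&z));
        return 0;
}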
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1e..62ee437b5c7e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 goto Platform_wake; 210 goto Platform_wake;
211 } 211 }
212 212
213 ftrace_stop();
213 error = disable_nonboot_cpus(); 214 error = disable_nonboot_cpus();
214 if (error || suspend_test(TEST_CPUS)) 215 if (error || suspend_test(TEST_CPUS))
215 goto Enable_cpus; 216 goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
232 233
233 Enable_cpus: 234 Enable_cpus:
234 enable_nonboot_cpus(); 235 enable_nonboot_cpus();
236 ftrace_start();
235 237
236 Platform_wake: 238 Platform_wake:
237 if (need_suspend_ops(state) && suspend_ops->wake) 239 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
265 goto Close; 267 goto Close;
266 } 268 }
267 suspend_console(); 269 suspend_console();
268 ftrace_stop();
269 suspend_test_start(); 270 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 271 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 272 if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
285 suspend_test_start(); 286 suspend_test_start();
286 dpm_resume_end(PMSG_RESUME); 287 dpm_resume_end(PMSG_RESUME);
287 suspend_test_finish("resume devices"); 288 suspend_test_finish("resume devices");
288 ftrace_start();
289 resume_console(); 289 resume_console();
290 Close: 290 Close:
291 if (need_suspend_ops(state) && suspend_ops->end) 291 if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86f..957f06164ad1 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -39,6 +39,7 @@ static struct snapshot_data {
39 char frozen; 39 char frozen;
40 char ready; 40 char ready;
41 char platform_support; 41 char platform_support;
42 bool free_bitmaps;
42} snapshot_state; 43} snapshot_state;
43 44
44atomic_t snapshot_device_available = ATOMIC_INIT(1); 45atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
60 error = -ENOSYS; 61 error = -ENOSYS;
61 goto Unlock; 62 goto Unlock;
62 } 63 }
63 if(create_basic_memory_bitmaps()) {
64 atomic_inc(&snapshot_device_available);
65 error = -ENOMEM;
66 goto Unlock;
67 }
68 nonseekable_open(inode, filp); 64 nonseekable_open(inode, filp);
69 data = &snapshot_state; 65 data = &snapshot_state;
70 filp->private_data = data; 66 filp->private_data = data;
@@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp)
87 data->swap = -1; 83 data->swap = -1;
88 data->mode = O_WRONLY; 84 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 85 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
86 if (!error) {
87 error = create_basic_memory_bitmaps();
88 data->free_bitmaps = !error;
89 }
90 if (error) 90 if (error)
91 pm_notifier_call_chain(PM_POST_RESTORE); 91 pm_notifier_call_chain(PM_POST_RESTORE);
92 } 92 }
93 if (error) { 93 if (error)
94 free_basic_memory_bitmaps();
95 atomic_inc(&snapshot_device_available); 94 atomic_inc(&snapshot_device_available);
96 } 95
97 data->frozen = 0; 96 data->frozen = 0;
98 data->ready = 0; 97 data->ready = 0;
99 data->platform_support = 0; 98 data->platform_support = 0;
@@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp)
111 lock_system_sleep(); 110 lock_system_sleep();
112 111
113 swsusp_free(); 112 swsusp_free();
114 free_basic_memory_bitmaps();
115 data = filp->private_data; 113 data = filp->private_data;
116 free_all_swap_pages(data->swap); 114 free_all_swap_pages(data->swap);
117 if (data->frozen) { 115 if (data->frozen) {
118 pm_restore_gfp_mask(); 116 pm_restore_gfp_mask();
117 free_basic_memory_bitmaps();
119 thaw_processes(); 118 thaw_processes();
119 } else if (data->free_bitmaps) {
120 free_basic_memory_bitmaps();
120 } 121 }
121 pm_notifier_call_chain(data->mode == O_RDONLY ? 122 pm_notifier_call_chain(data->mode == O_RDONLY ?
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 123 PM_POST_HIBERNATION : PM_POST_RESTORE);
@@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
207 if (!mutex_trylock(&pm_mutex)) 208 if (!mutex_trylock(&pm_mutex))
208 return -EBUSY; 209 return -EBUSY;
209 210
211 lock_device_hotplug();
210 data = filp->private_data; 212 data = filp->private_data;
211 213
212 switch (cmd) { 214 switch (cmd) {
@@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
220 printk("done.\n"); 222 printk("done.\n");
221 223
222 error = freeze_processes(); 224 error = freeze_processes();
223 if (!error) 225 if (error)
226 break;
227
228 error = create_basic_memory_bitmaps();
229 if (error)
230 thaw_processes();
231 else
224 data->frozen = 1; 232 data->frozen = 1;
233
225 break; 234 break;
226 235
227 case SNAPSHOT_UNFREEZE: 236 case SNAPSHOT_UNFREEZE:
228 if (!data->frozen || data->ready) 237 if (!data->frozen || data->ready)
229 break; 238 break;
230 pm_restore_gfp_mask(); 239 pm_restore_gfp_mask();
240 free_basic_memory_bitmaps();
241 data->free_bitmaps = false;
231 thaw_processes(); 242 thaw_processes();
232 data->frozen = 0; 243 data->frozen = 0;
233 break; 244 break;
@@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
371 382
372 } 383 }
373 384
385 unlock_device_hotplug();
374 mutex_unlock(&pm_mutex); 386 mutex_unlock(&pm_mutex);
375 387
376 return error; 388 return error;
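
Note on the power/user.c hunk above: the snapshot device now records whether it created the memory bitmaps (the new free_bitmaps flag), so release frees them exactly once on whichever path actually owns them. A minimal sketch of that ownership-flag pattern; the struct and field names here only mirror the diff loosely.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct snapshot_state {
        char *bitmaps;
        bool frozen;
        bool free_bitmaps;      /* did the freeze/restore path allocate them? */
};

static void do_release(struct snapshot_state *s)
{
        if (s->frozen) {
                free(s->bitmaps);       /* freeze path owns them */
                puts("thaw processes");
        } else if (s->free_bitmaps) {
                free(s->bitmaps);       /* restore-prepare path owns them */
        }
        s->bitmaps = NULL;
}

int main(void)
{
        struct snapshot_state s = {
                .bitmaps = malloc(32),
                .free_bitmaps = true,
        };

        do_release(&s);         /* freed exactly once */
        return 0;
}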
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5a7080e2a5..b4e8500afdb3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon)
2226 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c; 2227 struct console_cmdline *c;
2228 2228
2229 if (console_drivers)
2230 for_each_console(bcon)
2231 if (WARN(bcon == newcon,
2232 "console '%s%d' already registered\n",
2233 bcon->name, bcon->index))
2234 return;
2235
2229 /* 2236 /*
2230 * before we register a new CON_BOOT console, make sure we don't 2237 * before we register a new CON_BOOT console, make sure we don't
2231 * already have a valid console 2238 * already have a valid console
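
Note on the printk.c hunk above: register_console() gains a guard that walks the existing console list and refuses, with a warning, to register the same console twice. A sketch of a duplicate-registration check over a singly linked list; the structure and function names are simplified stand-ins.

#include <stdio.h>

struct console {
        const char *name;
        struct console *next;
};

static struct console *console_list;

static int register_console(struct console *newcon)
{
        for (struct console *c = console_list; c; c = c->next) {
                if (c == newcon) {
                        fprintf(stderr, "console '%s' already registered\n",
                                newcon->name);
                        return -1;
                }
        }
        newcon->next = console_list;
        console_list = newcon;
        return 0;
}

int main(void)
{
        struct console ttyS0 = { .name = "ttyS0" };

        register_console(&ttyS0);
        return register_console(&ttyS0) ? 1 : 0;        /* second call rejected */
}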
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee327f6a..dd562e9aa2c8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
236 */ 236 */
237 int dumpable = 0; 237 int dumpable = 0;
238 /* Don't let security modules deny introspection */ 238 /* Don't let security modules deny introspection */
239 if (task == current) 239 if (same_thread_group(task, current))
240 return 0; 240 return 0;
241 rcu_read_lock(); 241 rcu_read_lock();
242 tcred = __task_cred(task); 242 tcred = __task_cred(task);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 7f8e7590e3e5..77131966c4ad 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -67,12 +67,15 @@
67 67
68extern struct debug_obj_descr rcuhead_debug_descr; 68extern struct debug_obj_descr rcuhead_debug_descr;
69 69
70static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline int debug_rcu_head_queue(struct rcu_head *head)
71{ 71{
72 debug_object_activate(head, &rcuhead_debug_descr); 72 int r1;
73
74 r1 = debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr, 75 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY, 76 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED); 77 STATE_RCU_HEAD_QUEUED);
78 return r1;
76} 79}
77 80
78static inline void debug_rcu_head_unqueue(struct rcu_head *head) 81static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
83 debug_object_deactivate(head, &rcuhead_debug_descr); 86 debug_object_deactivate(head, &rcuhead_debug_descr);
84} 87}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 88#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head) 89static inline int debug_rcu_head_queue(struct rcu_head *head)
87{ 90{
91 return 0;
88} 92}
89 93
90static inline void debug_rcu_head_unqueue(struct rcu_head *head) 94static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
94 98
95extern void kfree(const void *); 99extern void kfree(const void *);
96 100
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
98{ 102{
99 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
100 104
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cce6ba8bbace..b02a339836b4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,7 +122,7 @@ struct lockdep_map rcu_sched_lock_map =
122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
123EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 123EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
124 124
125int debug_lockdep_rcu_enabled(void) 125int notrace debug_lockdep_rcu_enabled(void)
126{ 126{
127 return rcu_scheduler_active && debug_locks && 127 return rcu_scheduler_active && debug_locks &&
128 current->lockdep_recursion == 0; 128 current->lockdep_recursion == 0;
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
212} 212}
213 213
214/* 214/*
215 * fixup_init is called when:
216 * - an active object is initialized
217 */
218static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
219{
220 struct rcu_head *head = addr;
221
222 switch (state) {
223 case ODEBUG_STATE_ACTIVE:
224 /*
225 * Ensure that queued callbacks are all executed.
226 * If we detect that we are nested in a RCU read-side critical
227 * section, we should simply fail, otherwise we would deadlock.
228 * In !PREEMPT configurations, there is no way to tell if we are
229 * in a RCU read-side critical section or not, so we never
230 * attempt any fixup and just print a warning.
231 */
232#ifndef CONFIG_PREEMPT
233 WARN_ON_ONCE(1);
234 return 0;
235#endif
236 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
237 irqs_disabled()) {
238 WARN_ON_ONCE(1);
239 return 0;
240 }
241 rcu_barrier();
242 rcu_barrier_sched();
243 rcu_barrier_bh();
244 debug_object_init(head, &rcuhead_debug_descr);
245 return 1;
246 default:
247 return 0;
248 }
249}
250
251/*
252 * fixup_activate is called when: 215 * fixup_activate is called when:
253 * - an active object is activated 216 * - an active object is activated
254 * - an unknown object is activated (might be a statically initialized object) 217 * - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
268 debug_object_init(head, &rcuhead_debug_descr); 231 debug_object_init(head, &rcuhead_debug_descr);
269 debug_object_activate(head, &rcuhead_debug_descr); 232 debug_object_activate(head, &rcuhead_debug_descr);
270 return 0; 233 return 0;
271
272 case ODEBUG_STATE_ACTIVE:
273 /*
274 * Ensure that queued callbacks are all executed.
275 * If we detect that we are nested in a RCU read-side critical
276 * section, we should simply fail, otherwise we would deadlock.
277 * In !PREEMPT configurations, there is no way to tell if we are
278 * in a RCU read-side critical section or not, so we never
279 * attempt any fixup and just print a warning.
280 */
281#ifndef CONFIG_PREEMPT
282 WARN_ON_ONCE(1);
283 return 0;
284#endif
285 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
286 irqs_disabled()) {
287 WARN_ON_ONCE(1);
288 return 0;
289 }
290 rcu_barrier();
291 rcu_barrier_sched();
292 rcu_barrier_bh();
293 debug_object_activate(head, &rcuhead_debug_descr);
294 return 1;
295 default: 234 default:
296 return 0;
297 }
298}
299
300/*
301 * fixup_free is called when:
302 * - an active object is freed
303 */
304static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
305{
306 struct rcu_head *head = addr;
307
308 switch (state) {
309 case ODEBUG_STATE_ACTIVE:
310 /*
311 * Ensure that queued callbacks are all executed.
312 * If we detect that we are nested in a RCU read-side critical
313 * section, we should simply fail, otherwise we would deadlock.
314 * In !PREEMPT configurations, there is no way to tell if we are
315 * in a RCU read-side critical section or not, so we never
316 * attempt any fixup and just print a warning.
317 */
318#ifndef CONFIG_PREEMPT
319 WARN_ON_ONCE(1);
320 return 0;
321#endif
322 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
323 irqs_disabled()) {
324 WARN_ON_ONCE(1);
325 return 0;
326 }
327 rcu_barrier();
328 rcu_barrier_sched();
329 rcu_barrier_bh();
330 debug_object_free(head, &rcuhead_debug_descr);
331 return 1; 235 return 1;
332 default:
333 return 0;
334 } 236 }
335} 237}
336 238
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
369 271
370struct debug_obj_descr rcuhead_debug_descr = { 272struct debug_obj_descr rcuhead_debug_descr = {
371 .name = "rcu_head", 273 .name = "rcu_head",
372 .fixup_init = rcuhead_fixup_init,
373 .fixup_activate = rcuhead_fixup_activate, 274 .fixup_activate = rcuhead_fixup_activate,
374 .fixup_free = rcuhead_fixup_free,
375}; 275};
376EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 276EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
377#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 277#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
378 278
379#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 279#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
380void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, 280void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
381 unsigned long secs, 281 unsigned long secs,
382 unsigned long c_old, unsigned long c) 282 unsigned long c_old, unsigned long c)
383{ 283{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index aa344111de3e..9ed6075dc562 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user)
264 */ 264 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 266{
267 char *rn = NULL; 267 const char *rn = NULL;
268 struct rcu_head *next, *list; 268 struct rcu_head *next, *list;
269 unsigned long flags; 269 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 270 RCU_TRACE(int cb_count = 0);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ 36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ 37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ 38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(const char *name); /* Name of RCU type. */
40}; 40};
41 41
42/* Definition for rcupdate control block. */ 42/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index f4871e52c546..be63101c6175 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -52,72 +52,78 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int fqs_duration;
56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval = 60; /* Interval between stats, in seconds. */
58 /* Zero means "only at end of test". */
59static bool verbose; /* Print more debug info. */
60static bool test_no_idle_hz = true;
61 /* Test RCU support for tickless idle CPUs. */
62static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
63static int stutter = 5; /* Start/stop testing interval (in sec) */
64static int irqreader = 1; /* RCU readers from irq (timers). */
65static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
66static int fqs_holdoff; /* Hold time within burst (us). */
67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
69static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
70static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
71static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
72static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
73static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
74static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
75static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
76static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
77static char *torture_type = "rcu"; /* What RCU implementation to torture. */
78
79module_param(nreaders, int, 0444);
80MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
81module_param(nfakewriters, int, 0444);
82MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
83module_param(stat_interval, int, 0644);
84MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
85module_param(verbose, bool, 0444);
86MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
87module_param(test_no_idle_hz, bool, 0444);
88MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
89module_param(shuffle_interval, int, 0444);
90MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
91module_param(stutter, int, 0444);
92MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
93module_param(irqreader, int, 0444);
94MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
95module_param(fqs_duration, int, 0444); 56module_param(fqs_duration, int, 0444);
96MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
58static int fqs_holdoff;
97module_param(fqs_holdoff, int, 0444); 59module_param(fqs_holdoff, int, 0444);
98MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 60MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
61static int fqs_stutter = 3;
99module_param(fqs_stutter, int, 0444); 62module_param(fqs_stutter, int, 0444);
100MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 63MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
64static bool gp_exp;
65module_param(gp_exp, bool, 0444);
66MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
67static bool gp_normal;
68module_param(gp_normal, bool, 0444);
69MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
70static int irqreader = 1;
71module_param(irqreader, int, 0444);
72MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
73static int n_barrier_cbs;
101module_param(n_barrier_cbs, int, 0444); 74module_param(n_barrier_cbs, int, 0444);
102MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 75MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
103module_param(onoff_interval, int, 0444); 76static int nfakewriters = 4;
104MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 77module_param(nfakewriters, int, 0444);
78MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
79static int nreaders = -1;
80module_param(nreaders, int, 0444);
81MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
82static int object_debug;
83module_param(object_debug, int, 0444);
84MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
85static int onoff_holdoff;
105module_param(onoff_holdoff, int, 0444); 86module_param(onoff_holdoff, int, 0444);
106MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 87MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
88static int onoff_interval;
89module_param(onoff_interval, int, 0444);
90MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
91static int shuffle_interval = 3;
92module_param(shuffle_interval, int, 0444);
93MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
94static int shutdown_secs;
107module_param(shutdown_secs, int, 0444); 95module_param(shutdown_secs, int, 0444);
108MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 96MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
97static int stall_cpu;
109module_param(stall_cpu, int, 0444); 98module_param(stall_cpu, int, 0444);
110MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 99MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
100static int stall_cpu_holdoff = 10;
111module_param(stall_cpu_holdoff, int, 0444); 101module_param(stall_cpu_holdoff, int, 0444);
112MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 102MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
103static int stat_interval = 60;
104module_param(stat_interval, int, 0644);
105MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
106static int stutter = 5;
107module_param(stutter, int, 0444);
108MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
109static int test_boost = 1;
113module_param(test_boost, int, 0444); 110module_param(test_boost, int, 0444);
114MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 111MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
115module_param(test_boost_interval, int, 0444); 112static int test_boost_duration = 4;
116MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
117module_param(test_boost_duration, int, 0444); 113module_param(test_boost_duration, int, 0444);
118MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 114MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
115static int test_boost_interval = 7;
116module_param(test_boost_interval, int, 0444);
117MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
118static bool test_no_idle_hz = true;
119module_param(test_no_idle_hz, bool, 0444);
120MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
121static char *torture_type = "rcu";
119module_param(torture_type, charp, 0444); 122module_param(torture_type, charp, 0444);
120MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 123MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
124static bool verbose;
125module_param(verbose, bool, 0444);
126MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
121 127
122#define TORTURE_FLAG "-torture:" 128#define TORTURE_FLAG "-torture:"
123#define PRINTK_STRING(s) \ 129#define PRINTK_STRING(s) \
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
267 * Absorb kthreads into a kernel function that won't return, so that 273 * Absorb kthreads into a kernel function that won't return, so that
268 * they won't ever access module text or data again. 274 * they won't ever access module text or data again.
269 */ 275 */
270static void rcutorture_shutdown_absorb(char *title) 276static void rcutorture_shutdown_absorb(const char *title)
271{ 277{
272 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 278 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
273 pr_notice( 279 pr_notice(
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp)
337} 343}
338 344
339static void 345static void
340rcu_stutter_wait(char *title) 346rcu_stutter_wait(const char *title)
341{ 347{
342 while (stutter_pause_test || !rcutorture_runnable) { 348 while (stutter_pause_test || !rcutorture_runnable) {
343 if (rcutorture_runnable) 349 if (rcutorture_runnable)
@@ -360,13 +366,14 @@ struct rcu_torture_ops {
360 int (*completed)(void); 366 int (*completed)(void);
361 void (*deferred_free)(struct rcu_torture *p); 367 void (*deferred_free)(struct rcu_torture *p);
362 void (*sync)(void); 368 void (*sync)(void);
369 void (*exp_sync)(void);
363 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 370 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
364 void (*cb_barrier)(void); 371 void (*cb_barrier)(void);
365 void (*fqs)(void); 372 void (*fqs)(void);
366 int (*stats)(char *page); 373 int (*stats)(char *page);
367 int irq_capable; 374 int irq_capable;
368 int can_boost; 375 int can_boost;
369 char *name; 376 const char *name;
370}; 377};
371 378
372static struct rcu_torture_ops *cur_ops; 379static struct rcu_torture_ops *cur_ops;
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 450 call_rcu(&p->rtort_rcu, rcu_torture_cb);
444} 451}
445 452
446static struct rcu_torture_ops rcu_ops = {
447 .init = NULL,
448 .readlock = rcu_torture_read_lock,
449 .read_delay = rcu_read_delay,
450 .readunlock = rcu_torture_read_unlock,
451 .completed = rcu_torture_completed,
452 .deferred_free = rcu_torture_deferred_free,
453 .sync = synchronize_rcu,
454 .call = call_rcu,
455 .cb_barrier = rcu_barrier,
456 .fqs = rcu_force_quiescent_state,
457 .stats = NULL,
458 .irq_capable = 1,
459 .can_boost = rcu_can_boost(),
460 .name = "rcu"
461};
462
463static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
464{
465 int i;
466 struct rcu_torture *rp;
467 struct rcu_torture *rp1;
468
469 cur_ops->sync();
470 list_add(&p->rtort_free, &rcu_torture_removed);
471 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
472 i = rp->rtort_pipe_count;
473 if (i > RCU_TORTURE_PIPE_LEN)
474 i = RCU_TORTURE_PIPE_LEN;
475 atomic_inc(&rcu_torture_wcount[i]);
476 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
477 rp->rtort_mbtest = 0;
478 list_del(&rp->rtort_free);
479 rcu_torture_free(rp);
480 }
481 }
482}
483
484static void rcu_sync_torture_init(void) 453static void rcu_sync_torture_init(void)
485{ 454{
486 INIT_LIST_HEAD(&rcu_torture_removed); 455 INIT_LIST_HEAD(&rcu_torture_removed);
487} 456}
488 457
489static struct rcu_torture_ops rcu_sync_ops = { 458static struct rcu_torture_ops rcu_ops = {
490 .init = rcu_sync_torture_init, 459 .init = rcu_sync_torture_init,
491 .readlock = rcu_torture_read_lock, 460 .readlock = rcu_torture_read_lock,
492 .read_delay = rcu_read_delay, 461 .read_delay = rcu_read_delay,
493 .readunlock = rcu_torture_read_unlock, 462 .readunlock = rcu_torture_read_unlock,
494 .completed = rcu_torture_completed, 463 .completed = rcu_torture_completed,
495 .deferred_free = rcu_sync_torture_deferred_free, 464 .deferred_free = rcu_torture_deferred_free,
496 .sync = synchronize_rcu, 465 .sync = synchronize_rcu,
497 .call = NULL, 466 .exp_sync = synchronize_rcu_expedited,
498 .cb_barrier = NULL, 467 .call = call_rcu,
499 .fqs = rcu_force_quiescent_state, 468 .cb_barrier = rcu_barrier,
500 .stats = NULL,
501 .irq_capable = 1,
502 .can_boost = rcu_can_boost(),
503 .name = "rcu_sync"
504};
505
506static struct rcu_torture_ops rcu_expedited_ops = {
507 .init = rcu_sync_torture_init,
508 .readlock = rcu_torture_read_lock,
509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
510 .readunlock = rcu_torture_read_unlock,
511 .completed = rcu_no_completed,
512 .deferred_free = rcu_sync_torture_deferred_free,
513 .sync = synchronize_rcu_expedited,
514 .call = NULL,
515 .cb_barrier = NULL,
516 .fqs = rcu_force_quiescent_state, 469 .fqs = rcu_force_quiescent_state,
517 .stats = NULL, 470 .stats = NULL,
518 .irq_capable = 1, 471 .irq_capable = 1,
519 .can_boost = rcu_can_boost(), 472 .can_boost = rcu_can_boost(),
520 .name = "rcu_expedited" 473 .name = "rcu"
521}; 474};
522 475
523/* 476/*
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
546} 499}
547 500
548static struct rcu_torture_ops rcu_bh_ops = { 501static struct rcu_torture_ops rcu_bh_ops = {
549 .init = NULL, 502 .init = rcu_sync_torture_init,
550 .readlock = rcu_bh_torture_read_lock, 503 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 504 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 505 .readunlock = rcu_bh_torture_read_unlock,
553 .completed = rcu_bh_torture_completed, 506 .completed = rcu_bh_torture_completed,
554 .deferred_free = rcu_bh_torture_deferred_free, 507 .deferred_free = rcu_bh_torture_deferred_free,
555 .sync = synchronize_rcu_bh, 508 .sync = synchronize_rcu_bh,
509 .exp_sync = synchronize_rcu_bh_expedited,
556 .call = call_rcu_bh, 510 .call = call_rcu_bh,
557 .cb_barrier = rcu_barrier_bh, 511 .cb_barrier = rcu_barrier_bh,
558 .fqs = rcu_bh_force_quiescent_state, 512 .fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
561 .name = "rcu_bh" 515 .name = "rcu_bh"
562}; 516};
563 517
564static struct rcu_torture_ops rcu_bh_sync_ops = {
565 .init = rcu_sync_torture_init,
566 .readlock = rcu_bh_torture_read_lock,
567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
568 .readunlock = rcu_bh_torture_read_unlock,
569 .completed = rcu_bh_torture_completed,
570 .deferred_free = rcu_sync_torture_deferred_free,
571 .sync = synchronize_rcu_bh,
572 .call = NULL,
573 .cb_barrier = NULL,
574 .fqs = rcu_bh_force_quiescent_state,
575 .stats = NULL,
576 .irq_capable = 1,
577 .name = "rcu_bh_sync"
578};
579
580static struct rcu_torture_ops rcu_bh_expedited_ops = {
581 .init = rcu_sync_torture_init,
582 .readlock = rcu_bh_torture_read_lock,
583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
584 .readunlock = rcu_bh_torture_read_unlock,
585 .completed = rcu_bh_torture_completed,
586 .deferred_free = rcu_sync_torture_deferred_free,
587 .sync = synchronize_rcu_bh_expedited,
588 .call = NULL,
589 .cb_barrier = NULL,
590 .fqs = rcu_bh_force_quiescent_state,
591 .stats = NULL,
592 .irq_capable = 1,
593 .name = "rcu_bh_expedited"
594};
595
596/* 518/*
597 * Definitions for srcu torture testing. 519 * Definitions for srcu torture testing.
598 */ 520 */
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page)
667 return cnt; 589 return cnt;
668} 590}
669 591
592static void srcu_torture_synchronize_expedited(void)
593{
594 synchronize_srcu_expedited(&srcu_ctl);
595}
596
670static struct rcu_torture_ops srcu_ops = { 597static struct rcu_torture_ops srcu_ops = {
671 .init = rcu_sync_torture_init, 598 .init = rcu_sync_torture_init,
672 .readlock = srcu_torture_read_lock, 599 .readlock = srcu_torture_read_lock,
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = {
675 .completed = srcu_torture_completed, 602 .completed = srcu_torture_completed,
676 .deferred_free = srcu_torture_deferred_free, 603 .deferred_free = srcu_torture_deferred_free,
677 .sync = srcu_torture_synchronize, 604 .sync = srcu_torture_synchronize,
605 .exp_sync = srcu_torture_synchronize_expedited,
678 .call = srcu_torture_call, 606 .call = srcu_torture_call,
679 .cb_barrier = srcu_torture_barrier, 607 .cb_barrier = srcu_torture_barrier,
680 .stats = srcu_torture_stats, 608 .stats = srcu_torture_stats,
681 .name = "srcu" 609 .name = "srcu"
682}; 610};
683 611
684static struct rcu_torture_ops srcu_sync_ops = {
685 .init = rcu_sync_torture_init,
686 .readlock = srcu_torture_read_lock,
687 .read_delay = srcu_read_delay,
688 .readunlock = srcu_torture_read_unlock,
689 .completed = srcu_torture_completed,
690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = srcu_torture_synchronize,
692 .call = NULL,
693 .cb_barrier = NULL,
694 .stats = srcu_torture_stats,
695 .name = "srcu_sync"
696};
697
698static void srcu_torture_synchronize_expedited(void)
699{
700 synchronize_srcu_expedited(&srcu_ctl);
701}
702
703static struct rcu_torture_ops srcu_expedited_ops = {
704 .init = rcu_sync_torture_init,
705 .readlock = srcu_torture_read_lock,
706 .read_delay = srcu_read_delay,
707 .readunlock = srcu_torture_read_unlock,
708 .completed = srcu_torture_completed,
709 .deferred_free = rcu_sync_torture_deferred_free,
710 .sync = srcu_torture_synchronize_expedited,
711 .call = NULL,
712 .cb_barrier = NULL,
713 .stats = srcu_torture_stats,
714 .name = "srcu_expedited"
715};
716
717/* 612/*
718 * Definitions for sched torture testing. 613 * Definitions for sched torture testing.
719 */ 614 */
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = {
742 .completed = rcu_no_completed, 637 .completed = rcu_no_completed,
743 .deferred_free = rcu_sched_torture_deferred_free, 638 .deferred_free = rcu_sched_torture_deferred_free,
744 .sync = synchronize_sched, 639 .sync = synchronize_sched,
640 .exp_sync = synchronize_sched_expedited,
641 .call = call_rcu_sched,
745 .cb_barrier = rcu_barrier_sched, 642 .cb_barrier = rcu_barrier_sched,
746 .fqs = rcu_sched_force_quiescent_state, 643 .fqs = rcu_sched_force_quiescent_state,
747 .stats = NULL, 644 .stats = NULL,
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = {
749 .name = "sched" 646 .name = "sched"
750}; 647};
751 648
752static struct rcu_torture_ops sched_sync_ops = {
753 .init = rcu_sync_torture_init,
754 .readlock = sched_torture_read_lock,
755 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
756 .readunlock = sched_torture_read_unlock,
757 .completed = rcu_no_completed,
758 .deferred_free = rcu_sync_torture_deferred_free,
759 .sync = synchronize_sched,
760 .cb_barrier = NULL,
761 .fqs = rcu_sched_force_quiescent_state,
762 .stats = NULL,
763 .name = "sched_sync"
764};
765
766static struct rcu_torture_ops sched_expedited_ops = {
767 .init = rcu_sync_torture_init,
768 .readlock = sched_torture_read_lock,
769 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
770 .readunlock = sched_torture_read_unlock,
771 .completed = rcu_no_completed,
772 .deferred_free = rcu_sync_torture_deferred_free,
773 .sync = synchronize_sched_expedited,
774 .cb_barrier = NULL,
775 .fqs = rcu_sched_force_quiescent_state,
776 .stats = NULL,
777 .irq_capable = 1,
778 .name = "sched_expedited"
779};
780
781/* 649/*
782 * RCU torture priority-boost testing. Runs one real-time thread per 650 * RCU torture priority-boost testing. Runs one real-time thread per
783 * CPU for moderate bursts, repeatedly registering RCU callbacks and 651 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg)
927static int 795static int
928rcu_torture_writer(void *arg) 796rcu_torture_writer(void *arg)
929{ 797{
798 bool exp;
930 int i; 799 int i;
931 long oldbatch = rcu_batches_completed();
932 struct rcu_torture *rp; 800 struct rcu_torture *rp;
801 struct rcu_torture *rp1;
933 struct rcu_torture *old_rp; 802 struct rcu_torture *old_rp;
934 static DEFINE_RCU_RANDOM(rand); 803 static DEFINE_RCU_RANDOM(rand);
935 804
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg)
954 i = RCU_TORTURE_PIPE_LEN; 823 i = RCU_TORTURE_PIPE_LEN;
955 atomic_inc(&rcu_torture_wcount[i]); 824 atomic_inc(&rcu_torture_wcount[i]);
956 old_rp->rtort_pipe_count++; 825 old_rp->rtort_pipe_count++;
957 cur_ops->deferred_free(old_rp); 826 if (gp_normal == gp_exp)
827 exp = !!(rcu_random(&rand) & 0x80);
828 else
829 exp = gp_exp;
830 if (!exp) {
831 cur_ops->deferred_free(old_rp);
832 } else {
833 cur_ops->exp_sync();
834 list_add(&old_rp->rtort_free,
835 &rcu_torture_removed);
836 list_for_each_entry_safe(rp, rp1,
837 &rcu_torture_removed,
838 rtort_free) {
839 i = rp->rtort_pipe_count;
840 if (i > RCU_TORTURE_PIPE_LEN)
841 i = RCU_TORTURE_PIPE_LEN;
842 atomic_inc(&rcu_torture_wcount[i]);
843 if (++rp->rtort_pipe_count >=
844 RCU_TORTURE_PIPE_LEN) {
845 rp->rtort_mbtest = 0;
846 list_del(&rp->rtort_free);
847 rcu_torture_free(rp);
848 }
849 }
850 }
958 } 851 }
959 rcutorture_record_progress(++rcu_torture_current_version); 852 rcutorture_record_progress(++rcu_torture_current_version);
960 oldbatch = cur_ops->completed();
961 rcu_stutter_wait("rcu_torture_writer"); 853 rcu_stutter_wait("rcu_torture_writer");
962 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 854 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
963 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 855 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg)
983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 875 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
984 udelay(rcu_random(&rand) & 0x3ff); 876 udelay(rcu_random(&rand) & 0x3ff);
985 if (cur_ops->cb_barrier != NULL && 877 if (cur_ops->cb_barrier != NULL &&
986 rcu_random(&rand) % (nfakewriters * 8) == 0) 878 rcu_random(&rand) % (nfakewriters * 8) == 0) {
987 cur_ops->cb_barrier(); 879 cur_ops->cb_barrier();
988 else 880 } else if (gp_normal == gp_exp) {
881 if (rcu_random(&rand) & 0x80)
882 cur_ops->sync();
883 else
884 cur_ops->exp_sync();
885 } else if (gp_normal) {
989 cur_ops->sync(); 886 cur_ops->sync();
887 } else {
888 cur_ops->exp_sync();
889 }
990 rcu_stutter_wait("rcu_torture_fakewriter"); 890 rcu_stutter_wait("rcu_torture_fakewriter");
991 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 891 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
992 892
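The writer and fakewriter hunks above add the gp_normal and gp_exp module parameters: when the two agree (the default, both zero), each iteration picks between a normal and an expedited grace period at random; when exactly one is set, only that kind is used. A minimal userspace sketch of that decision table, with rand() standing in for rcu_random(); illustrative only, not part of the patch:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Userspace paraphrase of the gp_normal/gp_exp choice made by the
 * torture writer threads above.  rand() stands in for rcu_random();
 * the two bool parameters stand in for the rcutorture module
 * parameters of the same names.
 */
static bool use_expedited_gp(bool gp_normal, bool gp_exp)
{
	if (gp_normal == gp_exp)	/* both clear (default) or both set */
		return rand() & 0x80;	/* mix normal and expedited GPs */
	return gp_exp;			/* exactly one set: honor it */
}

int main(void)
{
	int i, n_exp = 0;

	for (i = 0; i < 1000; i++)
		n_exp += use_expedited_gp(false, false);
	printf("defaults: %d of 1000 iterations chose an expedited GP\n", n_exp);
	printf("gp_exp=1:    expedited? %d\n", use_expedited_gp(false, true));
	printf("gp_normal=1: expedited? %d\n", use_expedited_gp(true, false));
	return 0;
}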
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg)
1364} 1264}
1365 1265
1366static inline void 1266static inline void
1367rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1267rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1368{ 1268{
1369 pr_alert("%s" TORTURE_FLAG 1269 pr_alert("%s" TORTURE_FLAG
1370 "--- %s: nreaders=%d nfakewriters=%d " 1270 "--- %s: nreaders=%d nfakewriters=%d "
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg)
1534 torture_type, cpu); 1434 torture_type, cpu);
1535 starttime = jiffies; 1435 starttime = jiffies;
1536 n_online_attempts++; 1436 n_online_attempts++;
1537 if (cpu_up(cpu) == 0) { 1437 ret = cpu_up(cpu);
1438 if (ret) {
1439 if (verbose)
1440 pr_alert("%s" TORTURE_FLAG
1441 "rcu_torture_onoff task: online %d failed: errno %d\n",
1442 torture_type, cpu, ret);
1443 } else {
1538 if (verbose) 1444 if (verbose)
1539 pr_alert("%s" TORTURE_FLAG 1445 pr_alert("%s" TORTURE_FLAG
1540 "rcu_torture_onoff task: onlined %d\n", 1446 "rcu_torture_onoff task: onlined %d\n",
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void)
1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1840 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1935} 1841}
1936 1842
1843#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1844static void rcu_torture_leak_cb(struct rcu_head *rhp)
1845{
1846}
1847
1848static void rcu_torture_err_cb(struct rcu_head *rhp)
1849{
1850 /*
1851 * This -might- happen due to race conditions, but is unlikely.
1852 * The scenario that leads to this happening is that the
1853 * first of the pair of duplicate callbacks is queued,
1854 * someone else starts a grace period that includes that
1855 * callback, then the second of the pair must wait for the
1856 * next grace period. Unlikely, but can happen. If it
1857 * does happen, the debug-objects subsystem won't have splatted.
1858 */
1859 pr_alert("rcutorture: duplicated callback was invoked.\n");
1860}
1861#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1862
1863/*
1864 * Verify that double-free causes debug-objects to complain, but only
1865 * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
1866 * cannot be carried out.
1867 */
1868static void rcu_test_debug_objects(void)
1869{
1870#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1871 struct rcu_head rh1;
1872 struct rcu_head rh2;
1873
1874 init_rcu_head_on_stack(&rh1);
1875 init_rcu_head_on_stack(&rh2);
1876 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
1877
1878 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1879 preempt_disable(); /* Prevent preemption from interrupting test. */
1880 rcu_read_lock(); /* Make it impossible to finish a grace period. */
1881 call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
1882 local_irq_disable(); /* Make it harder to start a new grace period. */
1883 call_rcu(&rh2, rcu_torture_leak_cb);
1884 call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
1885 local_irq_enable();
1886 rcu_read_unlock();
1887 preempt_enable();
1888
1889 /* Wait for them all to get done so we can safely return. */
1890 rcu_barrier();
1891 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
1892 destroy_rcu_head_on_stack(&rh1);
1893 destroy_rcu_head_on_stack(&rh2);
1894#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1895 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
1896#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1897}
1898
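rcu_test_debug_objects() above queues the same rcu_head twice on purpose so that a CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel can demonstrate that the duplicate is caught (the matching __call_rcu() hunk in rcutree.c further down leaks such a callback rather than corrupting the list). The following toy userspace sketch shows the general idea of a guard flag catching a double enqueue; it is not the kernel's debug-objects code:

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy illustration of what rcu_test_debug_objects() exercises:
 * queueing the same callback head twice should be caught by a guard
 * bit before the second enqueue corrupts the callback list.
 */
struct toy_head {
	struct toy_head *next;
	bool queued;		/* plays the role of the debug-objects state */
};

/* Returns true (and refuses the enqueue) if the head is already queued. */
static bool toy_call_rcu(struct toy_head **list, struct toy_head *head)
{
	if (head->queued) {
		fprintf(stderr, "toy: duplicate enqueue detected, leaking callback\n");
		return true;
	}
	head->queued = true;
	head->next = *list;
	*list = head;
	return false;
}

int main(void)
{
	struct toy_head *cbs = NULL;
	struct toy_head rh = { .next = NULL, .queued = false };

	toy_call_rcu(&cbs, &rh);	/* first enqueue: accepted */
	toy_call_rcu(&cbs, &rh);	/* duplicate: detected and dropped */
	return 0;
}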
1937static int __init 1899static int __init
1938rcu_torture_init(void) 1900rcu_torture_init(void)
1939{ 1901{
@@ -1941,11 +1903,9 @@ rcu_torture_init(void)
1941 int cpu; 1903 int cpu;
1942 int firsterr = 0; 1904 int firsterr = 0;
1943 int retval; 1905 int retval;
1944 static struct rcu_torture_ops *torture_ops[] = 1906 static struct rcu_torture_ops *torture_ops[] = {
1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1907 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1908 };
1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1949 1909
1950 mutex_lock(&fullstop_mutex); 1910 mutex_lock(&fullstop_mutex);
1951 1911
@@ -2163,6 +2123,8 @@ rcu_torture_init(void)
2163 firsterr = retval; 2123 firsterr = retval;
2164 goto unwind; 2124 goto unwind;
2165 } 2125 }
2126 if (object_debug)
2127 rcu_test_debug_objects();
2166 rcutorture_record_test_transition(); 2128 rcutorture_record_test_transition();
2167 mutex_unlock(&fullstop_mutex); 2129 mutex_unlock(&fullstop_mutex);
2168 return 0; 2130 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 068de3a93606..32618b3fe4e6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -53,18 +53,38 @@
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/ftrace_event.h>
57#include <linux/suspend.h>
56 58
57#include "rcutree.h" 59#include "rcutree.h"
58#include <trace/events/rcu.h> 60#include <trace/events/rcu.h>
59 61
60#include "rcu.h" 62#include "rcu.h"
61 63
64/*
65 * Strings used in tracepoints need to be exported via the
66 * tracing system such that tools like perf and trace-cmd can
67 * translate the string address pointers to actual text.
68 */
69#define TPS(x) tracepoint_string(x)
70
62/* Data structures. */ 71/* Data structures. */
63 72
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 74static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 75
67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ 76/*
77 * In order to export the rcu_state name to the tracing tools, it
78 * needs to be added in the __tracepoint_string section.
79 * This requires defining a separate variable tp_<sname>_varname
80 * that points to the string being used, and this will allow
81 * the tracing userspace tools to be able to decipher the string
82 * address to the matching string.
83 */
84#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
85static char sname##_varname[] = #sname; \
86static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
87struct rcu_state sname##_state = { \
68 .level = { &sname##_state.node[0] }, \ 88 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 89 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 90 .fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 95 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 96 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 97 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 98 .name = sname##_varname, \
79 .abbr = sabbr, \ 99 .abbr = sabbr, \
80} 100}; \
81 101DEFINE_PER_CPU(struct rcu_data, sname##_data)
82struct rcu_state rcu_sched_state =
83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
85 102
86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 103RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 104RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
88 105
89static struct rcu_state *rcu_state; 106static struct rcu_state *rcu_state;
90LIST_HEAD(rcu_struct_flavors); 107LIST_HEAD(rcu_struct_flavors);
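The TPS() wrapper and the tp_<sname>_varname pointer emitted by RCU_STATE_INITIALIZER exist so that the constant strings handed to tracepoints can be resolved back to text: a pointer to each string is collected in the __tracepoint_string section, which perf and trace-cmd read from the kernel image. A hedged userspace sketch of the same "pointers gathered in a named ELF section" idiom; the section and symbol names here are invented, and the walk relies on GNU ld's automatic __start_/__stop_ symbols:

#include <stdio.h>

/*
 * The string itself lives wherever the compiler puts it, while a
 * pointer to it is collected into a dedicated section ("my_strings"
 * here, "__tracepoint_string" in the kernel) so that an external tool
 * can walk the section and translate pointer values back into text.
 */
static char rcu_sched_name[] = "rcu_sched";
static const char *tp_rcu_sched_name
	__attribute__((section("my_strings"), used)) = rcu_sched_name;

/* Bounds of the section, provided automatically by GNU ld. */
extern const char *__start_my_strings[];
extern const char *__stop_my_strings[];

int main(void)
{
	const char **p;

	for (p = __start_my_strings; p < __stop_my_strings; p++)
		printf("exported string at %p: \"%s\"\n", (void *)*p, *p);
	return 0;
}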
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
178 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 195 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
179 196
180 if (rdp->passed_quiesce == 0) 197 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 198 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
182 rdp->passed_quiesce = 1; 199 rdp->passed_quiesce = 1;
183} 200}
184 201
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 204 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 205
189 if (rdp->passed_quiesce == 0) 206 if (rdp->passed_quiesce == 0)
190 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 207 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
191 rdp->passed_quiesce = 1; 208 rdp->passed_quiesce = 1;
192} 209}
193 210
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
198 */ 215 */
199void rcu_note_context_switch(int cpu) 216void rcu_note_context_switch(int cpu)
200{ 217{
201 trace_rcu_utilization("Start context switch"); 218 trace_rcu_utilization(TPS("Start context switch"));
202 rcu_sched_qs(cpu); 219 rcu_sched_qs(cpu);
203 rcu_preempt_note_context_switch(cpu); 220 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 221 trace_rcu_utilization(TPS("End context switch"));
205} 222}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207 224
208DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
209 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
210 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
229 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
230 .dynticks_idle = ATOMIC_INIT(1),
231#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
211}; 232};
212 233
213static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 234static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
226 247
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 248static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp); 249 struct rcu_data *rdp);
229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 250static void force_qs_rnp(struct rcu_state *rsp,
251 int (*f)(struct rcu_data *rsp, bool *isidle,
252 unsigned long *maxj),
253 bool *isidle, unsigned long *maxj);
230static void force_quiescent_state(struct rcu_state *rsp); 254static void force_quiescent_state(struct rcu_state *rsp);
231static int rcu_pending(int cpu); 255static int rcu_pending(int cpu);
232 256
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
345static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
346 bool user) 370 bool user)
347{ 371{
348 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
349 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
350 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle = idle_task(smp_processor_id());
351 375
352 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
353 ftrace_dump(DUMP_ORIG); 377 ftrace_dump(DUMP_ORIG);
354 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 378 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
355 current->pid, current->comm, 379 current->pid, current->comm,
@@ -411,6 +435,7 @@ void rcu_idle_enter(void)
411 435
412 local_irq_save(flags); 436 local_irq_save(flags);
413 rcu_eqs_enter(false); 437 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
414 local_irq_restore(flags); 439 local_irq_restore(flags);
415} 440}
416EXPORT_SYMBOL_GPL(rcu_idle_enter); 441EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +453,6 @@ void rcu_user_enter(void)
428{ 453{
429 rcu_eqs_enter(1); 454 rcu_eqs_enter(1);
430} 455}
431
432/**
433 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
434 * after the current irq returns.
435 *
436 * This is similar to rcu_user_enter() but in the context of a non-nesting
437 * irq. After this call, RCU enters into idle mode when the interrupt
438 * returns.
439 */
440void rcu_user_enter_after_irq(void)
441{
442 unsigned long flags;
443 struct rcu_dynticks *rdtp;
444
445 local_irq_save(flags);
446 rdtp = &__get_cpu_var(rcu_dynticks);
447 /* Ensure this irq is interrupting a non-idle RCU state. */
448 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
449 rdtp->dynticks_nesting = 1;
450 local_irq_restore(flags);
451}
452#endif /* CONFIG_RCU_USER_QS */ 456#endif /* CONFIG_RCU_USER_QS */
453 457
454/** 458/**
@@ -479,9 +483,10 @@ void rcu_irq_exit(void)
479 rdtp->dynticks_nesting--; 483 rdtp->dynticks_nesting--;
480 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
481 if (rdtp->dynticks_nesting) 485 if (rdtp->dynticks_nesting)
482 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 486 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
483 else 487 else
484 rcu_eqs_enter_common(rdtp, oldval, true); 488 rcu_eqs_enter_common(rdtp, oldval, true);
489 rcu_sysidle_enter(rdtp, 1);
485 local_irq_restore(flags); 490 local_irq_restore(flags);
486} 491}
487 492
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
501 smp_mb__after_atomic_inc(); /* See above. */ 506 smp_mb__after_atomic_inc(); /* See above. */
502 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 507 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
503 rcu_cleanup_after_idle(smp_processor_id()); 508 rcu_cleanup_after_idle(smp_processor_id());
504 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
505 if (!user && !is_idle_task(current)) { 510 if (!user && !is_idle_task(current)) {
506 struct task_struct *idle = idle_task(smp_processor_id()); 511 struct task_struct *idle = idle_task(smp_processor_id());
507 512
508 trace_rcu_dyntick("Error on exit: not idle task", 513 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
509 oldval, rdtp->dynticks_nesting); 514 oldval, rdtp->dynticks_nesting);
510 ftrace_dump(DUMP_ORIG); 515 ftrace_dump(DUMP_ORIG);
511 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 516 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -550,6 +555,7 @@ void rcu_idle_exit(void)
550 555
551 local_irq_save(flags); 556 local_irq_save(flags);
552 rcu_eqs_exit(false); 557 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
553 local_irq_restore(flags); 559 local_irq_restore(flags);
554} 560}
555EXPORT_SYMBOL_GPL(rcu_idle_exit); 561EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +571,6 @@ void rcu_user_exit(void)
565{ 571{
566 rcu_eqs_exit(1); 572 rcu_eqs_exit(1);
567} 573}
568
569/**
570 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
571 * idle mode after the current non-nesting irq returns.
572 *
573 * This is similar to rcu_user_exit() but in the context of an irq.
574 * This is called when the irq has interrupted a userspace RCU idle mode
575 * context. When the current non-nesting interrupt returns after this call,
576 * the CPU won't restore the RCU idle mode.
577 */
578void rcu_user_exit_after_irq(void)
579{
580 unsigned long flags;
581 struct rcu_dynticks *rdtp;
582
583 local_irq_save(flags);
584 rdtp = &__get_cpu_var(rcu_dynticks);
585 /* Ensure we are interrupting an RCU idle mode. */
586 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
587 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
588 local_irq_restore(flags);
589}
590#endif /* CONFIG_RCU_USER_QS */ 574#endif /* CONFIG_RCU_USER_QS */
591 575
592/** 576/**
@@ -620,9 +604,10 @@ void rcu_irq_enter(void)
620 rdtp->dynticks_nesting++; 604 rdtp->dynticks_nesting++;
621 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
622 if (oldval) 606 if (oldval)
623 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 607 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
624 else 608 else
625 rcu_eqs_exit_common(rdtp, oldval, true); 609 rcu_eqs_exit_common(rdtp, oldval, true);
610 rcu_sysidle_exit(rdtp, 1);
626 local_irq_restore(flags); 611 local_irq_restore(flags);
627} 612}
628 613
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
746 * credit them with an implicit quiescent state. Return 1 if this CPU 731 * credit them with an implicit quiescent state. Return 1 if this CPU
747 * is in dynticks idle mode, which is an extended quiescent state. 732 * is in dynticks idle mode, which is an extended quiescent state.
748 */ 733 */
749static int dyntick_save_progress_counter(struct rcu_data *rdp) 734static int dyntick_save_progress_counter(struct rcu_data *rdp,
735 bool *isidle, unsigned long *maxj)
750{ 736{
751 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 737 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
738 rcu_sysidle_check_cpu(rdp, isidle, maxj);
752 return (rdp->dynticks_snap & 0x1) == 0; 739 return (rdp->dynticks_snap & 0x1) == 0;
753} 740}
754 741
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
758 * idle state since the last call to dyntick_save_progress_counter() 745 * idle state since the last call to dyntick_save_progress_counter()
759 * for this same CPU, or by virtue of having been offline. 746 * for this same CPU, or by virtue of having been offline.
760 */ 747 */
761static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 748static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
749 bool *isidle, unsigned long *maxj)
762{ 750{
763 unsigned int curr; 751 unsigned int curr;
764 unsigned int snap; 752 unsigned int snap;
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
775 * of the current RCU grace period. 763 * of the current RCU grace period.
776 */ 764 */
777 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 765 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
778 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 766 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
779 rdp->dynticks_fqs++; 767 rdp->dynticks_fqs++;
780 return 1; 768 return 1;
781 } 769 }
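The two hunks above rely on the even/odd convention of the per-CPU dynticks counter: it is incremented on every transition into and out of dyntick-idle, so an even snapshot means the CPU is idle now, and an advance of two or more since dyntick_save_progress_counter() ran means it passed through idle, either way an extended quiescent state. A small standalone C sketch of that check, with UINT_CMP_GE() redefined locally the way the kernel defines it:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Even counter value: CPU is idle right now.  Counter advanced by two
 * or more since the snapshot: CPU went through idle in the meantime.
 * The subtraction-based comparison stays correct across counter wrap.
 */
#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (unsigned int)((a) - (b)))

static bool in_extended_qs(unsigned int snap, unsigned int curr)
{
	return (curr & 0x1) == 0 ||		/* idle at the moment, or    */
	       UINT_CMP_GE(curr, snap + 2);	/* idled since the snapshot. */
}

int main(void)
{
	printf("snap=5 curr=5 -> %d  (still busy, no quiescent state yet)\n",
	       in_extended_qs(5, 5));
	printf("snap=5 curr=6 -> %d  (entered idle)\n", in_extended_qs(5, 6));
	printf("snap=5 curr=7 -> %d  (idled and became busy again)\n",
	       in_extended_qs(5, 7));
	printf("snap=4 curr=4 -> %d  (was idle at snapshot time)\n",
	       in_extended_qs(4, 4));
	return 0;
}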
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
795 return 0; /* Grace period is not old enough. */ 783 return 0; /* Grace period is not old enough. */
796 barrier(); 784 barrier();
797 if (cpu_is_offline(rdp->cpu)) { 785 if (cpu_is_offline(rdp->cpu)) {
798 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 786 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
799 rdp->offline_fqs++; 787 rdp->offline_fqs++;
800 return 1; 788 return 1;
801 } 789 }
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1032 * rcu_nocb_wait_gp(). 1020 * rcu_nocb_wait_gp().
1033 */ 1021 */
1034static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1022static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1035 unsigned long c, char *s) 1023 unsigned long c, const char *s)
1036{ 1024{
1037 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1025 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1038 rnp->completed, c, rnp->level, 1026 rnp->completed, c, rnp->level,
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1058 * grace period is already marked as needed, return to the caller. 1046 * grace period is already marked as needed, return to the caller.
1059 */ 1047 */
1060 c = rcu_cbs_completed(rdp->rsp, rnp); 1048 c = rcu_cbs_completed(rdp->rsp, rnp);
1061 trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); 1049 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1062 if (rnp->need_future_gp[c & 0x1]) { 1050 if (rnp->need_future_gp[c & 0x1]) {
1063 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); 1051 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1064 return c; 1052 return c;
1065 } 1053 }
1066 1054
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1074 if (rnp->gpnum != rnp->completed || 1062 if (rnp->gpnum != rnp->completed ||
1075 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1063 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1076 rnp->need_future_gp[c & 0x1]++; 1064 rnp->need_future_gp[c & 0x1]++;
1077 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); 1065 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1078 return c; 1066 return c;
1079 } 1067 }
1080 1068
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1102 * recorded, trace and leave. 1090 * recorded, trace and leave.
1103 */ 1091 */
1104 if (rnp_root->need_future_gp[c & 0x1]) { 1092 if (rnp_root->need_future_gp[c & 0x1]) {
1105 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); 1093 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
1106 goto unlock_out; 1094 goto unlock_out;
1107 } 1095 }
1108 1096
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1111 1099
1112 /* If a grace period is not already in progress, start one. */ 1100 /* If a grace period is not already in progress, start one. */
1113 if (rnp_root->gpnum != rnp_root->completed) { 1101 if (rnp_root->gpnum != rnp_root->completed) {
1114 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); 1102 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1115 } else { 1103 } else {
1116 trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); 1104 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1117 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1105 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1118 } 1106 }
1119unlock_out: 1107unlock_out:
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1137 rcu_nocb_gp_cleanup(rsp, rnp); 1125 rcu_nocb_gp_cleanup(rsp, rnp);
1138 rnp->need_future_gp[c & 0x1] = 0; 1126 rnp->need_future_gp[c & 0x1] = 0;
1139 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1127 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1140 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); 1128 trace_rcu_future_gp(rnp, rdp, c,
1129 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1141 return needmore; 1130 return needmore;
1142} 1131}
1143 1132
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1205 1194
1206 /* Trace depending on how much we were able to accelerate. */ 1195 /* Trace depending on how much we were able to accelerate. */
1207 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1196 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1208 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); 1197 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1209 else 1198 else
1210 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); 1199 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1211} 1200}
1212 1201
1213/* 1202/*
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1273 1262
1274 /* Remember that we saw this grace-period completion. */ 1263 /* Remember that we saw this grace-period completion. */
1275 rdp->completed = rnp->completed; 1264 rdp->completed = rnp->completed;
1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1265 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1277 } 1266 }
1278 1267
1279 if (rdp->gpnum != rnp->gpnum) { 1268 if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1283 * go looking for one. 1272 * go looking for one.
1284 */ 1273 */
1285 rdp->gpnum = rnp->gpnum; 1274 rdp->gpnum = rnp->gpnum;
1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1275 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1287 rdp->passed_quiesce = 0; 1276 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1277 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp); 1278 zero_cpu_stall_ticks(rdp);
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1315 struct rcu_data *rdp; 1304 struct rcu_data *rdp;
1316 struct rcu_node *rnp = rcu_get_root(rsp); 1305 struct rcu_node *rnp = rcu_get_root(rsp);
1317 1306
1307 rcu_bind_gp_kthread();
1318 raw_spin_lock_irq(&rnp->lock); 1308 raw_spin_lock_irq(&rnp->lock);
1319 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1320 1310
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1326 1316
1327 /* Advance to a new grace period and initialize state. */ 1317 /* Advance to a new grace period and initialize state. */
1328 rsp->gpnum++; 1318 rsp->gpnum++;
1329 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1330 record_gp_stall_check_time(rsp); 1320 record_gp_stall_check_time(rsp);
1331 raw_spin_unlock_irq(&rnp->lock); 1321 raw_spin_unlock_irq(&rnp->lock);
1332 1322
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp)
1379int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1380{ 1370{
1381 int fqs_state = fqs_state_in; 1371 int fqs_state = fqs_state_in;
1372 bool isidle = false;
1373 unsigned long maxj;
1382 struct rcu_node *rnp = rcu_get_root(rsp); 1374 struct rcu_node *rnp = rcu_get_root(rsp);
1383 1375
1384 rsp->n_force_qs++; 1376 rsp->n_force_qs++;
1385 if (fqs_state == RCU_SAVE_DYNTICK) { 1377 if (fqs_state == RCU_SAVE_DYNTICK) {
1386 /* Collect dyntick-idle snapshots. */ 1378 /* Collect dyntick-idle snapshots. */
1387 force_qs_rnp(rsp, dyntick_save_progress_counter); 1379 if (is_sysidle_rcu_state(rsp)) {
1380 isidle = 1;
1381 maxj = jiffies - ULONG_MAX / 4;
1382 }
1383 force_qs_rnp(rsp, dyntick_save_progress_counter,
1384 &isidle, &maxj);
1385 rcu_sysidle_report_gp(rsp, isidle, maxj);
1388 fqs_state = RCU_FORCE_QS; 1386 fqs_state = RCU_FORCE_QS;
1389 } else { 1387 } else {
1390 /* Handle dyntick-idle and offline CPUs. */ 1388 /* Handle dyntick-idle and offline CPUs. */
1391 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1389 isidle = 0;
1390 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1392 } 1391 }
1393 /* Clear flag to prevent immediate re-entry. */ 1392 /* Clear flag to prevent immediate re-entry. */
1394 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1393 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1448 rcu_nocb_gp_set(rnp, nocb); 1447 rcu_nocb_gp_set(rnp, nocb);
1449 1448
1450 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1449 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1451 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1450 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1452 rsp->fqs_state = RCU_GP_IDLE; 1451 rsp->fqs_state = RCU_GP_IDLE;
1453 rdp = this_cpu_ptr(rsp->rda); 1452 rdp = this_cpu_ptr(rsp->rda);
1454 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1558 1557
1559 /* 1558 /*
1560 * We can't do wakeups while holding the rnp->lock, as that 1559 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Deter 1560 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context. 1561 * the wakeup to interrupt context. And don't bother waking
1562 * up the running kthread.
1563 */ 1563 */
1564 irq_work_queue(&rsp->wakeup_work); 1564 if (current != rsp->gp_kthread)
1565 irq_work_queue(&rsp->wakeup_work);
1565} 1566}
1566 1567
1567/* 1568/*
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1857 RCU_TRACE(mask = rdp->grpmask); 1858 RCU_TRACE(mask = rdp->grpmask);
1858 trace_rcu_grace_period(rsp->name, 1859 trace_rcu_grace_period(rsp->name,
1859 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1860 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1860 "cpuofl"); 1861 TPS("cpuofl"));
1861} 1862}
1862 1863
1863/* 1864/*
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2044 */ 2045 */
2045void rcu_check_callbacks(int cpu, int user) 2046void rcu_check_callbacks(int cpu, int user)
2046{ 2047{
2047 trace_rcu_utilization("Start scheduler-tick"); 2048 trace_rcu_utilization(TPS("Start scheduler-tick"));
2048 increment_cpu_stall_ticks(); 2049 increment_cpu_stall_ticks();
2049 if (user || rcu_is_cpu_rrupt_from_idle()) { 2050 if (user || rcu_is_cpu_rrupt_from_idle()) {
2050 2051
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user)
2077 rcu_preempt_check_callbacks(cpu); 2078 rcu_preempt_check_callbacks(cpu);
2078 if (rcu_pending(cpu)) 2079 if (rcu_pending(cpu))
2079 invoke_rcu_core(); 2080 invoke_rcu_core();
2080 trace_rcu_utilization("End scheduler-tick"); 2081 trace_rcu_utilization(TPS("End scheduler-tick"));
2081} 2082}
2082 2083
2083/* 2084/*
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user)
2087 * 2088 *
2088 * The caller must have suppressed start of new grace periods. 2089 * The caller must have suppressed start of new grace periods.
2089 */ 2090 */
2090static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2091static void force_qs_rnp(struct rcu_state *rsp,
2092 int (*f)(struct rcu_data *rsp, bool *isidle,
2093 unsigned long *maxj),
2094 bool *isidle, unsigned long *maxj)
2091{ 2095{
2092 unsigned long bit; 2096 unsigned long bit;
2093 int cpu; 2097 int cpu;
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2110 cpu = rnp->grplo; 2114 cpu = rnp->grplo;
2111 bit = 1; 2115 bit = 1;
2112 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2116 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2113 if ((rnp->qsmask & bit) != 0 && 2117 if ((rnp->qsmask & bit) != 0) {
2114 f(per_cpu_ptr(rsp->rda, cpu))) 2118 if ((rnp->qsmaskinit & bit) != 0)
2115 mask |= bit; 2119 *isidle = 0;
2120 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2121 mask |= bit;
2122 }
2116 } 2123 }
2117 if (mask != 0) { 2124 if (mask != 0) {
2118 2125
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2208 2215
2209 if (cpu_is_offline(smp_processor_id())) 2216 if (cpu_is_offline(smp_processor_id()))
2210 return; 2217 return;
2211 trace_rcu_utilization("Start RCU core"); 2218 trace_rcu_utilization(TPS("Start RCU core"));
2212 for_each_rcu_flavor(rsp) 2219 for_each_rcu_flavor(rsp)
2213 __rcu_process_callbacks(rsp); 2220 __rcu_process_callbacks(rsp);
2214 trace_rcu_utilization("End RCU core"); 2221 trace_rcu_utilization(TPS("End RCU core"));
2215} 2222}
2216 2223
2217/* 2224/*
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2287} 2294}
2288 2295
2289/* 2296/*
2297 * RCU callback function to leak a callback.
2298 */
2299static void rcu_leak_callback(struct rcu_head *rhp)
2300{
2301}
2302
2303/*
2290 * Helper function for call_rcu() and friends. The cpu argument will 2304 * Helper function for call_rcu() and friends. The cpu argument will
2291 * normally be -1, indicating "currently running CPU". It may specify 2305 * normally be -1, indicating "currently running CPU". It may specify
2292 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2306 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2300 struct rcu_data *rdp; 2314 struct rcu_data *rdp;
2301 2315
2302 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2316 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2303 debug_rcu_head_queue(head); 2317 if (debug_rcu_head_queue(head)) {
2318 /* Probable double call_rcu(), so leak the callback. */
2319 ACCESS_ONCE(head->func) = rcu_leak_callback;
2320 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
2321 return;
2322 }
2304 head->func = func; 2323 head->func = func;
2305 head->next = NULL; 2324 head->next = NULL;
2306 2325
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2720 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 2739 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2721 * the compiler is expected to optimize this away. 2740 * the compiler is expected to optimize this away.
2722 */ 2741 */
2723static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, 2742static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
2724 int cpu, unsigned long done) 2743 int cpu, unsigned long done)
2725{ 2744{
2726 trace_rcu_barrier(rsp->name, s, cpu, 2745 trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
2785 * transition. The "if" expression below therefore rounds the old 2804 * transition. The "if" expression below therefore rounds the old
2786 * value up to the next even number and adds two before comparing. 2805 * value up to the next even number and adds two before comparing.
2787 */ 2806 */
2788 snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2807 snap_done = rsp->n_barrier_done;
2789 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2808 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2790 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2809
2810 /*
2811 * If the value in snap is odd, we needed to wait for the current
2812 * rcu_barrier() to complete, then wait for the next one, in other
2813 * words, we need the value of snap_done to be three larger than
2814 * the value of snap. On the other hand, if the value in snap is
2815 * even, we only had to wait for the next rcu_barrier() to complete,
2816 * in other words, we need the value of snap_done to be only two
2817 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
2818 * this for us (thank you, Linus!).
2819 */
2820 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
2791 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2821 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2792 smp_mb(); /* caller's subsequent code after above check. */ 2822 smp_mb(); /* caller's subsequent code after above check. */
2793 mutex_unlock(&rsp->barrier_mutex); 2823 mutex_unlock(&rsp->barrier_mutex);
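The comment added above describes the rounding in words; the arithmetic itself is easy to check outside the kernel. For an odd snapshot (an rcu_barrier() was in flight) the expression (snap + 3) & ~0x1 yields snap + 3, and for an even snapshot it yields snap + 2, exactly the completion counts the comment calls for. A short standalone check:

#include <stdio.h>

/*
 * Worked example of "(snap + 3) & ~0x1": odd snap rounds up to
 * snap + 3 (wait out the in-flight rcu_barrier() plus the next one),
 * even snap rounds to snap + 2 (just the next one).
 */
int main(void)
{
	unsigned long snap;

	for (snap = 40; snap <= 43; snap++)
		printf("snap=%lu (%s) -> wait until n_barrier_done >= %lu\n",
		       snap,
		       (snap & 0x1) ? "odd, barrier in flight" : "even, idle",
		       (snap + 3) & ~0x1UL);
	return 0;
}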
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2930 rdp->blimit = blimit; 2960 rdp->blimit = blimit;
2931 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 2961 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2932 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2962 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2963 rcu_sysidle_init_percpu_data(rdp->dynticks);
2933 atomic_set(&rdp->dynticks->dynticks, 2964 atomic_set(&rdp->dynticks->dynticks,
2934 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2965 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2935 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2966 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2952 rdp->completed = rnp->completed; 2983 rdp->completed = rnp->completed;
2953 rdp->passed_quiesce = 0; 2984 rdp->passed_quiesce = 0;
2954 rdp->qs_pending = 0; 2985 rdp->qs_pending = 0;
2955 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2986 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
2956 } 2987 }
2957 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2988 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2958 rnp = rnp->parent; 2989 rnp = rnp->parent;
@@ -2982,7 +3013,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
2982 struct rcu_node *rnp = rdp->mynode; 3013 struct rcu_node *rnp = rdp->mynode;
2983 struct rcu_state *rsp; 3014 struct rcu_state *rsp;
2984 3015
2985 trace_rcu_utilization("Start CPU hotplug"); 3016 trace_rcu_utilization(TPS("Start CPU hotplug"));
2986 switch (action) { 3017 switch (action) {
2987 case CPU_UP_PREPARE: 3018 case CPU_UP_PREPARE:
2988 case CPU_UP_PREPARE_FROZEN: 3019 case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3042,26 @@ static int rcu_cpu_notify(struct notifier_block *self,
3011 default: 3042 default:
3012 break; 3043 break;
3013 } 3044 }
3014 trace_rcu_utilization("End CPU hotplug"); 3045 trace_rcu_utilization(TPS("End CPU hotplug"));
3046 return NOTIFY_OK;
3047}
3048
3049static int rcu_pm_notify(struct notifier_block *self,
3050 unsigned long action, void *hcpu)
3051{
3052 switch (action) {
3053 case PM_HIBERNATION_PREPARE:
3054 case PM_SUSPEND_PREPARE:
3055 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3056 rcu_expedited = 1;
3057 break;
3058 case PM_POST_HIBERNATION:
3059 case PM_POST_SUSPEND:
3060 rcu_expedited = 0;
3061 break;
3062 default:
3063 break;
3064 }
3015 return NOTIFY_OK; 3065 return NOTIFY_OK;
3016} 3066}
3017 3067
@@ -3256,6 +3306,7 @@ void __init rcu_init(void)
3256 * or the scheduler are operational. 3306 * or the scheduler are operational.
3257 */ 3307 */
3258 cpu_notifier(rcu_cpu_notify, 0); 3308 cpu_notifier(rcu_cpu_notify, 0);
3309 pm_notifier(rcu_pm_notify, 0);
3259 for_each_online_cpu(cpu) 3310 for_each_online_cpu(cpu)
3260 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3261} 3312}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b3832581043c..5f97eab602cd 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
92 long long dynticks_idle_nesting;
93 /* irq/process nesting level from idle. */
94 atomic_t dynticks_idle; /* Even value for idle, else odd. */
95 /* "Idle" excludes userspace execution. */
96 unsigned long dynticks_idle_jiffies;
97 /* End of last non-NMI non-idle period. */
98#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
91#ifdef CONFIG_RCU_FAST_NO_HZ 99#ifdef CONFIG_RCU_FAST_NO_HZ
92 bool all_lazy; /* Are all CPU's CBs lazy? */ 100 bool all_lazy; /* Are all CPU's CBs lazy? */
93 unsigned long nonlazy_posted; 101 unsigned long nonlazy_posted;
@@ -445,7 +453,7 @@ struct rcu_state {
445 /* for CPU stalls. */ 453 /* for CPU stalls. */
446 unsigned long gp_max; /* Maximum GP duration in */ 454 unsigned long gp_max; /* Maximum GP duration in */
447 /* jiffies. */ 455 /* jiffies. */
448 char *name; /* Name of structure. */ 456 const char *name; /* Name of structure. */
449 char abbr; /* Abbreviated name. */ 457 char abbr; /* Abbreviated name. */
450 struct list_head flavors; /* List of RCU flavors. */ 458 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */ 459 struct irq_work wakeup_work; /* Postponed wakeups */
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
545static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 553static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
546static void rcu_kick_nohz_cpu(int cpu); 554static void rcu_kick_nohz_cpu(int cpu);
547static bool init_nocb_callback_list(struct rcu_data *rdp); 555static bool init_nocb_callback_list(struct rcu_data *rdp);
556static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
557static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
558static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
559 unsigned long *maxj);
560static bool is_sysidle_rcu_state(struct rcu_state *rsp);
561static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
562 unsigned long maxj);
563static void rcu_bind_gp_kthread(void);
564static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
548 565
549#endif /* #ifndef RCU_TREE_NONCORE */ 566#endif /* #ifndef RCU_TREE_NONCORE */
550 567
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 769e12e3151b..130c97b027f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h> 31#include "time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void)
110 110
111#ifdef CONFIG_TREE_PREEMPT_RCU 111#ifdef CONFIG_TREE_PREEMPT_RCU
112 112
113struct rcu_state rcu_preempt_state = 113RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
116static struct rcu_state *rcu_state = &rcu_preempt_state; 114static struct rcu_state *rcu_state = &rcu_preempt_state;
117 115
118static int rcu_preempted_readers_exp(struct rcu_node *rnp); 116static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu)
169 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 167 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
170 168
171 if (rdp->passed_quiesce == 0) 169 if (rdp->passed_quiesce == 0)
172 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 170 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
173 rdp->passed_quiesce = 1; 171 rdp->passed_quiesce = 1;
174 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 172 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
175} 173}
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t)
388 np = rcu_next_node_entry(t, rnp); 386 np = rcu_next_node_entry(t, rnp);
389 list_del_init(&t->rcu_node_entry); 387 list_del_init(&t->rcu_node_entry);
390 t->rcu_blocked_node = NULL; 388 t->rcu_blocked_node = NULL;
391 trace_rcu_unlock_preempted_task("rcu_preempt", 389 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
392 rnp->gpnum, t->pid); 390 rnp->gpnum, t->pid);
393 if (&t->rcu_node_entry == rnp->gp_tasks) 391 if (&t->rcu_node_entry == rnp->gp_tasks)
394 rnp->gp_tasks = np; 392 rnp->gp_tasks = np;
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t)
412 */ 410 */
413 empty_exp_now = !rcu_preempted_readers_exp(rnp); 411 empty_exp_now = !rcu_preempted_readers_exp(rnp);
414 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 412 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
415 trace_rcu_quiescent_state_report("preempt_rcu", 413 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
416 rnp->gpnum, 414 rnp->gpnum,
417 0, rnp->qsmask, 415 0, rnp->qsmask,
418 rnp->level, 416 rnp->level,
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg)
1250 int spincnt = 0; 1248 int spincnt = 0;
1251 int more2boost; 1249 int more2boost;
1252 1250
1253 trace_rcu_utilization("Start boost kthread@init"); 1251 trace_rcu_utilization(TPS("Start boost kthread@init"));
1254 for (;;) { 1252 for (;;) {
1255 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1253 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1256 trace_rcu_utilization("End boost kthread@rcu_wait"); 1254 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1257 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1255 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1258 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1256 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1259 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1257 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1260 more2boost = rcu_boost(rnp); 1258 more2boost = rcu_boost(rnp);
1261 if (more2boost) 1259 if (more2boost)
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg)
1264 spincnt = 0; 1262 spincnt = 0;
1265 if (spincnt > 10) { 1263 if (spincnt > 10) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1264 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1267 trace_rcu_utilization("End boost kthread@rcu_yield"); 1265 trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1268 schedule_timeout_interruptible(2); 1266 schedule_timeout_interruptible(2);
1269 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1267 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1270 spincnt = 0; 1268 spincnt = 0;
1271 } 1269 }
1272 } 1270 }
1273 /* NOTREACHED */ 1271 /* NOTREACHED */
1274 trace_rcu_utilization("End boost kthread@notreached"); 1272 trace_rcu_utilization(TPS("End boost kthread@notreached"));
1275 return 0; 1273 return 0;
1276} 1274}
1277 1275
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1419 int spincnt; 1417 int spincnt;
1420 1418
1421 for (spincnt = 0; spincnt < 10; spincnt++) { 1419 for (spincnt = 0; spincnt < 10; spincnt++) {
1422 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1420 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1423 local_bh_disable(); 1421 local_bh_disable();
1424 *statusp = RCU_KTHREAD_RUNNING; 1422 *statusp = RCU_KTHREAD_RUNNING;
1425 this_cpu_inc(rcu_cpu_kthread_loops); 1423 this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
1431 rcu_kthread_do_work(); 1429 rcu_kthread_do_work();
1432 local_bh_enable(); 1430 local_bh_enable();
1433 if (*workp == 0) { 1431 if (*workp == 0) {
1434 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1432 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1435 *statusp = RCU_KTHREAD_WAITING; 1433 *statusp = RCU_KTHREAD_WAITING;
1436 return; 1434 return;
1437 } 1435 }
1438 } 1436 }
1439 *statusp = RCU_KTHREAD_YIELDING; 1437 *statusp = RCU_KTHREAD_YIELDING;
1440 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1438 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1441 schedule_timeout_interruptible(2); 1439 schedule_timeout_interruptible(2);
1442 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1440 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1443 *statusp = RCU_KTHREAD_WAITING; 1441 *statusp = RCU_KTHREAD_WAITING;
1444} 1442}
1445 1443
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2202 * Wait for the grace period. Do so interruptibly to avoid messing 2200 * Wait for the grace period. Do so interruptibly to avoid messing
2203 * up the load average. 2201 * up the load average.
2204 */ 2202 */
2205 trace_rcu_future_gp(rnp, rdp, c, "StartWait"); 2203 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2206 for (;;) { 2204 for (;;) {
2207 wait_event_interruptible( 2205 wait_event_interruptible(
2208 rnp->nocb_gp_wq[c & 0x1], 2206 rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2210 if (likely(d)) 2208 if (likely(d))
2211 break; 2209 break;
2212 flush_signals(current); 2210 flush_signals(current);
2213 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); 2211 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2214 } 2212 }
2215 trace_rcu_future_gp(rnp, rdp, c, "EndWait"); 2213 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2216 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2214 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2217} 2215}
2218 2216
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu)
2375 smp_send_reschedule(cpu); 2373 smp_send_reschedule(cpu);
2376#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2374#endif /* #ifdef CONFIG_NO_HZ_FULL */
2377} 2375}
2376
2377
2378#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2379
2380/*
2381 * Define RCU flavor that holds sysidle state. This needs to be the
2382 * most active flavor of RCU.
2383 */
2384#ifdef CONFIG_PREEMPT_RCU
2385static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2386#else /* #ifdef CONFIG_PREEMPT_RCU */
2387static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2388#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2389
2390static int full_sysidle_state; /* Current system-idle state. */
2391#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2392#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2393#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2394#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2395#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2396
2397/*
2398 * Invoked to note exit from irq or task transition to idle. Note that
2399 * usermode execution does -not- count as idle here! After all, we want
2400 * to detect full-system idle states, not RCU quiescent states and grace
2401 * periods. The caller must have disabled interrupts.
2402 */
2403static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2404{
2405 unsigned long j;
2406
2407 /* Adjust nesting, check for fully idle. */
2408 if (irq) {
2409 rdtp->dynticks_idle_nesting--;
2410 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2411 if (rdtp->dynticks_idle_nesting != 0)
2412 return; /* Still not fully idle. */
2413 } else {
2414 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2415 DYNTICK_TASK_NEST_VALUE) {
2416 rdtp->dynticks_idle_nesting = 0;
2417 } else {
2418 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2419 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2420 return; /* Still not fully idle. */
2421 }
2422 }
2423
2424 /* Record start of fully idle period. */
2425 j = jiffies;
2426 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2427 smp_mb__before_atomic_inc();
2428 atomic_inc(&rdtp->dynticks_idle);
2429 smp_mb__after_atomic_inc();
2430 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2431}
2432
2433/*
2434 * Unconditionally force exit from full system-idle state. This is
2435 * invoked when a normal CPU exits idle, but must be called separately
2436 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2437 * is that the timekeeping CPU is permitted to take scheduling-clock
2438 * interrupts while the system is in system-idle state, and of course
2439 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2440 * interrupt from any other type of interrupt.
2441 */
2442void rcu_sysidle_force_exit(void)
2443{
2444 int oldstate = ACCESS_ONCE(full_sysidle_state);
2445 int newoldstate;
2446
2447 /*
2448 * Each pass through the following loop attempts to exit full
2449 * system-idle state. If contention proves to be a problem,
2450 * a trylock-based contention tree could be used here.
2451 */
2452 while (oldstate > RCU_SYSIDLE_SHORT) {
2453 newoldstate = cmpxchg(&full_sysidle_state,
2454 oldstate, RCU_SYSIDLE_NOT);
2455 if (oldstate == newoldstate &&
2456 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2457 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2458 return; /* We cleared it, done! */
2459 }
2460 oldstate = newoldstate;
2461 }
2462 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2463}
2464
2465/*
2466 * Invoked to note entry to irq or task transition from idle. Note that
2467 * usermode execution does -not- count as idle here! The caller must
2468 * have disabled interrupts.
2469 */
2470static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2471{
2472 /* Adjust nesting, check for already non-idle. */
2473 if (irq) {
2474 rdtp->dynticks_idle_nesting++;
2475 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2476 if (rdtp->dynticks_idle_nesting != 1)
2477 return; /* Already non-idle. */
2478 } else {
2479 /*
2480 * Allow for irq misnesting. Yes, it really is possible
2481 * to enter an irq handler then never leave it, and maybe
2482 * also vice versa. Handle both possibilities.
2483 */
2484 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2485 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2486 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2487 return; /* Already non-idle. */
2488 } else {
2489 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2490 }
2491 }
2492
2493 /* Record end of idle period. */
2494 smp_mb__before_atomic_inc();
2495 atomic_inc(&rdtp->dynticks_idle);
2496 smp_mb__after_atomic_inc();
2497 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2498
2499 /*
2500 * If we are the timekeeping CPU, we are permitted to be non-idle
2501 * during a system-idle state. This must be the case, because
2502 * the timekeeping CPU has to take scheduling-clock interrupts
2503 * during the time that the system is transitioning to full
2504 * system-idle state. This means that the timekeeping CPU must
2505 * invoke rcu_sysidle_force_exit() directly if it does anything
2506 * more than take a scheduling-clock interrupt.
2507 */
2508 if (smp_processor_id() == tick_do_timer_cpu)
2509 return;
2510
2511 /* Update system-idle state: We are clearly no longer fully idle! */
2512 rcu_sysidle_force_exit();
2513}
2514
2515/*
2516 * Check to see if the current CPU is idle. Note that usermode execution
2517 * does not count as idle. The caller must have disabled interrupts.
2518 */
2519static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2520 unsigned long *maxj)
2521{
2522 int cur;
2523 unsigned long j;
2524 struct rcu_dynticks *rdtp = rdp->dynticks;
2525
2526 /*
2527 * If some other CPU has already reported non-idle, if this is
2528 * not the flavor of RCU that tracks sysidle state, or if this
2529 * is an offline or the timekeeping CPU, nothing to do.
2530 */
2531 if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2532 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2533 return;
2534 if (rcu_gp_in_progress(rdp->rsp))
2535 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2536
2537 /* Pick up current idle and NMI-nesting counter and check. */
2538 cur = atomic_read(&rdtp->dynticks_idle);
2539 if (cur & 0x1) {
2540 *isidle = false; /* We are not idle! */
2541 return;
2542 }
2543 smp_mb(); /* Read counters before timestamps. */
2544
2545 /* Pick up timestamps. */
2546 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2547 /* If this CPU entered idle more recently, update maxj timestamp. */
2548 if (ULONG_CMP_LT(*maxj, j))
2549 *maxj = j;
2550}
2551
2552/*
2553 * Is this the flavor of RCU that is handling full-system idle?
2554 */
2555static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2556{
2557 return rsp == rcu_sysidle_state;
2558}
2559
2560/*
2561 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2562 * timekeeping CPU.
2563 */
2564static void rcu_bind_gp_kthread(void)
2565{
2566 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2567
2568 if (cpu < 0 || cpu >= nr_cpu_ids)
2569 return;
2570 if (raw_smp_processor_id() != cpu)
2571 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2572}
2573
2574/*
2575 * Return a delay in jiffies based on the number of CPUs, rcu_node
2576 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2577 * systems more time to transition to full-idle state in order to
2578 * avoid the cache thrashing that otherwise occurs on the state variable.
2579 * Really small systems (fewer than a couple of tens of CPUs) should
2580 * instead use a single global atomically incremented counter, and later
2581 * versions of this will automatically reconfigure themselves accordingly.
2582 */
2583static unsigned long rcu_sysidle_delay(void)
2584{
2585 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2586 return 0;
2587 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2588}
2589
2590/*
2591 * Advance the full-system-idle state. This is invoked when all of
2592 * the non-timekeeping CPUs are idle.
2593 */
2594static void rcu_sysidle(unsigned long j)
2595{
2596 /* Check the current state. */
2597 switch (ACCESS_ONCE(full_sysidle_state)) {
2598 case RCU_SYSIDLE_NOT:
2599
2600 /* First time all are idle, so note a short idle period. */
2601 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2602 break;
2603
2604 case RCU_SYSIDLE_SHORT:
2605
2606 /*
2607 * Idle for a bit, time to advance to next state?
2608 * cmpxchg failure means race with non-idle, let them win.
2609 */
2610 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2611 (void)cmpxchg(&full_sysidle_state,
2612 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2613 break;
2614
2615 case RCU_SYSIDLE_LONG:
2616
2617 /*
2618 * Do an additional check pass before advancing to full.
2619 * cmpxchg failure means race with non-idle, let them win.
2620 */
2621 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2622 (void)cmpxchg(&full_sysidle_state,
2623 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2624 break;
2625
2626 default:
2627 break;
2628 }
2629}
2630
2631/*
2632 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2633 * back to the beginning.
2634 */
2635static void rcu_sysidle_cancel(void)
2636{
2637 smp_mb();
2638 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2639}
2640
2641/*
2642 * Update the sysidle state based on the results of a force-quiescent-state
2643 * scan of the CPUs' dyntick-idle state.
2644 */
2645static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2646 unsigned long maxj, bool gpkt)
2647{
2648 if (rsp != rcu_sysidle_state)
2649 return; /* Wrong flavor, ignore. */
2650 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2651 return; /* Running state machine from timekeeping CPU. */
2652 if (isidle)
2653 rcu_sysidle(maxj); /* More idle! */
2654 else
2655 rcu_sysidle_cancel(); /* Idle is over. */
2656}
2657
2658/*
2659 * Wrapper for rcu_sysidle_report() when called from the grace-period
2660 * kthread's context.
2661 */
2662static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2663 unsigned long maxj)
2664{
2665 rcu_sysidle_report(rsp, isidle, maxj, true);
2666}
2667
2668/* Callback and function for forcing an RCU grace period. */
2669struct rcu_sysidle_head {
2670 struct rcu_head rh;
2671 int inuse;
2672};
2673
2674static void rcu_sysidle_cb(struct rcu_head *rhp)
2675{
2676 struct rcu_sysidle_head *rshp;
2677
2678 /*
2679 * The following memory barrier is needed to replace the
2680 * memory barriers that would normally be in the memory
2681 * allocator.
2682 */
2683 smp_mb(); /* grace period precedes setting inuse. */
2684
2685 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2686 ACCESS_ONCE(rshp->inuse) = 0;
2687}
2688
2689/*
2690 * Check to see if the system is fully idle, other than the timekeeping CPU.
2691 * The caller must have disabled interrupts.
2692 */
2693bool rcu_sys_is_idle(void)
2694{
2695 static struct rcu_sysidle_head rsh;
2696 int rss = ACCESS_ONCE(full_sysidle_state);
2697
2698 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2699 return false;
2700
2701 /* Handle small-system case by doing a full scan of CPUs. */
2702 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2703 int oldrss = rss - 1;
2704
2705 /*
2706 * One pass to advance to each state up to _FULL.
2707 * Give up if any pass fails to advance the state.
2708 */
2709 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2710 int cpu;
2711 bool isidle = true;
2712 unsigned long maxj = jiffies - ULONG_MAX / 4;
2713 struct rcu_data *rdp;
2714
2715 /* Scan all the CPUs looking for nonidle CPUs. */
2716 for_each_possible_cpu(cpu) {
2717 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2718 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2719 if (!isidle)
2720 break;
2721 }
2722 rcu_sysidle_report(rcu_sysidle_state,
2723 isidle, maxj, false);
2724 oldrss = rss;
2725 rss = ACCESS_ONCE(full_sysidle_state);
2726 }
2727 }
2728
2729 /* If this is the first observation of an idle period, record it. */
2730 if (rss == RCU_SYSIDLE_FULL) {
2731 rss = cmpxchg(&full_sysidle_state,
2732 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2733 return rss == RCU_SYSIDLE_FULL;
2734 }
2735
2736 smp_mb(); /* ensure rss load happens before later caller actions. */
2737
2738 /* If already fully idle, tell the caller (in case of races). */
2739 if (rss == RCU_SYSIDLE_FULL_NOTED)
2740 return true;
2741
2742 /*
2743 * If we aren't there yet, and a grace period is not in flight,
2744 * initiate a grace period. Either way, tell the caller that
2745 * we are not there yet. We use an xchg() rather than an assignment
2746 * to make up for the memory barriers that would otherwise be
2747 * provided by the memory allocator.
2748 */
2749 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2750 !rcu_gp_in_progress(rcu_sysidle_state) &&
2751 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2752 call_rcu(&rsh.rh, rcu_sysidle_cb);
2753 return false;
2754}
2755
2756/*
2757 * Initialize dynticks sysidle state for CPUs coming online.
2758 */
2759static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2760{
2761 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2762}
2763
2764#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2765
2766static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2767{
2768}
2769
2770static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2771{
2772}
2773
2774static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2775 unsigned long *maxj)
2776{
2777}
2778
2779static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2780{
2781 return false;
2782}
2783
2784static void rcu_bind_gp_kthread(void)
2785{
2786}
2787
2788static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2789 unsigned long maxj)
2790{
2791}
2792
2793static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2794{
2795}
2796
2797#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
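
For reference, here is a standalone sketch of the full-system-idle state machine that the new code above implements. It is not part of the patch: the state names mirror the kernel's RCU_SYSIDLE_* values, but the helper names, the C11 atomics, and the toy main() are illustrative assumptions, and the real kernel additionally gates the SHORT/LONG/FULL transitions on rcu_sysidle_delay() and drives them from the grace-period kthread.

/*
 * Toy model of the sysidle state machine: advance one step when all
 * non-timekeeping CPUs appear idle, snap back to NOT when any of them
 * goes non-idle.  cmpxchg-style updates lose races to non-idle CPUs.
 */
#include <stdatomic.h>
#include <stdio.h>

enum { SYSIDLE_NOT, SYSIDLE_SHORT, SYSIDLE_LONG, SYSIDLE_FULL, SYSIDLE_FULL_NOTED };

static _Atomic int sysidle_state = SYSIDLE_NOT;

/* All non-timekeeping CPUs look idle: try to advance one step. */
static void sysidle_advance(void)
{
        int old = atomic_load(&sysidle_state);

        switch (old) {
        case SYSIDLE_NOT:
                atomic_store(&sysidle_state, SYSIDLE_SHORT);
                break;
        case SYSIDLE_SHORT:
        case SYSIDLE_LONG:
                /* Lose the race gracefully if a CPU just went non-idle. */
                atomic_compare_exchange_strong(&sysidle_state, &old, old + 1);
                break;
        default:
                break;          /* FULL is promoted to FULL_NOTED elsewhere */
        }
}

/* A non-timekeeping CPU went non-idle: kick the machine back to NOT. */
static void sysidle_cancel(void)
{
        atomic_store(&sysidle_state, SYSIDLE_NOT);
}

int main(void)
{
        for (int i = 0; i < 4; i++) {
                sysidle_advance();
                printf("after advance %d: state %d\n", i + 1, atomic_load(&sysidle_state));
        }
        sysidle_cancel();
        printf("after cancel: state %d\n", atomic_load(&sysidle_state));
        return 0;
}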
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
32#endif 32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34 34
35int reboot_default; 35/*
36 * This variable is used privately to keep track of whether or not
37 * reboot_type is still set to its default value (i.e., reboot= hasn't
38 * been set on the command line). This is needed so that we can
39 * suppress DMI scanning for reboot quirks. Without it, it's
40 * impossible to override a faulty reboot quirk without recompiling.
41 */
42int reboot_default = 1;
36int reboot_cpu; 43int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI; 44enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 45int reboot_force;
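
The comment added above describes a simple pattern: a flag that stays at its default until the user picks something on the command line, after which quirk code must keep its hands off. A minimal userspace sketch of that pattern follows; it is not the kernel's implementation, and parse_reboot_arg()/apply_dmi_quirk() are hypothetical stand-ins for the reboot= parser and the DMI quirk tables.

#include <stdio.h>
#include <string.h>

static int reboot_default = 1;  /* stays 1 until reboot= is seen on the command line */
static char reboot_type = 'a';  /* 'a' stands in for BOOT_ACPI in this toy model */

static void parse_reboot_arg(const char *arg)
{
        reboot_default = 0;     /* the user chose explicitly */
        if (!strcmp(arg, "bios"))
                reboot_type = 'b';
}

static void apply_dmi_quirk(char quirk_type)
{
        if (reboot_default)     /* only honour the quirk if the user did not choose */
                reboot_type = quirk_type;
}

int main(void)
{
        parse_reboot_arg("bios");
        apply_dmi_quirk('k');   /* ignored: reboot= was given */
        printf("reboot_type=%c reboot_default=%d\n", reboot_type, reboot_default);
        return 0;
}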
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e7049..4aa8a305aede 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,8 +17,8 @@
17void res_counter_init(struct res_counter *counter, struct res_counter *parent) 17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{ 18{
19 spin_lock_init(&counter->lock); 19 spin_lock_init(&counter->lock);
20 counter->limit = RESOURCE_MAX; 20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RESOURCE_MAX; 21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
178#endif 178#endif
179 179
180int res_counter_memparse_write_strategy(const char *buf, 180int res_counter_memparse_write_strategy(const char *buf,
181 unsigned long long *res) 181 unsigned long long *resp)
182{ 182{
183 char *end; 183 char *end;
184 unsigned long long res;
184 185
185 /* return RESOURCE_MAX(unlimited) if "-1" is specified */ 186 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
186 if (*buf == '-') { 187 if (*buf == '-') {
187 *res = simple_strtoull(buf + 1, &end, 10); 188 res = simple_strtoull(buf + 1, &end, 10);
188 if (*res != 1 || *end != '\0') 189 if (res != 1 || *end != '\0')
189 return -EINVAL; 190 return -EINVAL;
190 *res = RESOURCE_MAX; 191 *resp = RES_COUNTER_MAX;
191 return 0; 192 return 0;
192 } 193 }
193 194
194 *res = memparse(buf, &end); 195 res = memparse(buf, &end);
195 if (*end != '\0') 196 if (*end != '\0')
196 return -EINVAL; 197 return -EINVAL;
197 198
198 *res = PAGE_ALIGN(*res); 199 if (PAGE_ALIGN(res) >= res)
200 res = PAGE_ALIGN(res);
201 else
202 res = RES_COUNTER_MAX;
203
204 *resp = res;
205
199 return 0; 206 return 0;
200} 207}
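
The new PAGE_ALIGN() check above exists because rounding a value near the top of the range up to a page boundary wraps around. A small standalone demonstration, assuming a 4 KiB page and using clamp_to_page() as a hypothetical stand-in for the kernel logic:

#include <stdio.h>

#define PAGE_SIZE       4096ULL
#define ALIGN_UP(x)     (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define COUNTER_MAX     (~0ULL)         /* stand-in for RES_COUNTER_MAX */

static unsigned long long clamp_to_page(unsigned long long res)
{
        /* If aligning up wrapped past zero, treat the request as unlimited. */
        if (ALIGN_UP(res) >= res)
                return ALIGN_UP(res);
        return COUNTER_MAX;
}

int main(void)
{
        unsigned long long near_max = ~0ULL - 100;      /* wraps when aligned */

        printf("naive  : %llu\n", ALIGN_UP(near_max));          /* wrapped to a tiny value */
        printf("guarded: %llu\n", clamp_to_page(near_max));     /* COUNTER_MAX */
        printf("normal : %llu\n", clamp_to_page(5000));         /* 8192 */
        return 0;
}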
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..5ac63c9a995a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
978 rq->skip_clock_update = 1; 978 rq->skip_clock_update = 1;
979} 979}
980 980
981static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
982
983void register_task_migration_notifier(struct notifier_block *n)
984{
985 atomic_notifier_chain_register(&task_migration_notifier, n);
986}
987
988#ifdef CONFIG_SMP 981#ifdef CONFIG_SMP
989void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 982void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
990{ 983{
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1015 trace_sched_migrate_task(p, new_cpu); 1008 trace_sched_migrate_task(p, new_cpu);
1016 1009
1017 if (task_cpu(p) != new_cpu) { 1010 if (task_cpu(p) != new_cpu) {
1018 struct task_migration_notifier tmn;
1019
1020 if (p->sched_class->migrate_task_rq) 1011 if (p->sched_class->migrate_task_rq)
1021 p->sched_class->migrate_task_rq(p, new_cpu); 1012 p->sched_class->migrate_task_rq(p, new_cpu);
1022 p->se.nr_migrations++; 1013 p->se.nr_migrations++;
1023 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1014 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1024
1025 tmn.task = p;
1026 tmn.from_cpu = task_cpu(p);
1027 tmn.to_cpu = new_cpu;
1028
1029 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1030 } 1015 }
1031 1016
1032 __set_task_cpu(p, new_cpu); 1017 __set_task_cpu(p, new_cpu);
@@ -2527,13 +2512,11 @@ void __sched schedule_preempt_disabled(void)
2527 */ 2512 */
2528asmlinkage void __sched notrace preempt_schedule(void) 2513asmlinkage void __sched notrace preempt_schedule(void)
2529{ 2514{
2530 struct thread_info *ti = current_thread_info();
2531
2532 /* 2515 /*
2533 * If there is a non-zero preempt_count or interrupts are disabled, 2516 * If there is a non-zero preempt_count or interrupts are disabled,
2534 * we do not want to preempt the current task. Just return.. 2517 * we do not want to preempt the current task. Just return..
2535 */ 2518 */
2536 if (likely(ti->preempt_count || irqs_disabled())) 2519 if (likely(!preemptible()))
2537 return; 2520 return;
2538 2521
2539 do { 2522 do {
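
A toy model of the preemptible() predicate that the hunk above switches to; it is not the kernel implementation, merely a sketch showing that the helper folds the old preempt_count and irqs_disabled checks into a single test.

#include <stdbool.h>
#include <stdio.h>

static int preempt_count;       /* stand-ins for the per-thread/per-cpu state */
static bool irqs_disabled;

static bool preemptible(void)
{
        return preempt_count == 0 && !irqs_disabled;
}

int main(void)
{
        preempt_count = 1;
        printf("preemptible: %d\n", preemptible());     /* 0: would return early */
        preempt_count = 0;
        printf("preemptible: %d\n", preemptible());     /* 1: may reschedule */
        return 0;
}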
@@ -2677,7 +2660,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2677 if (unlikely(!q)) 2660 if (unlikely(!q))
2678 return; 2661 return;
2679 2662
2680 if (unlikely(!nr_exclusive)) 2663 if (unlikely(nr_exclusive != 1))
2681 wake_flags = 0; 2664 wake_flags = 0;
2682 2665
2683 spin_lock_irqsave(&q->lock, flags); 2666 spin_lock_irqsave(&q->lock, flags);
@@ -4964,7 +4947,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4964 SD_BALANCE_FORK | 4947 SD_BALANCE_FORK |
4965 SD_BALANCE_EXEC | 4948 SD_BALANCE_EXEC |
4966 SD_SHARE_CPUPOWER | 4949 SD_SHARE_CPUPOWER |
4967 SD_SHARE_PKG_RESOURCES); 4950 SD_SHARE_PKG_RESOURCES |
4951 SD_PREFER_SIBLING);
4968 if (nr_node_ids == 1) 4952 if (nr_node_ids == 1)
4969 pflags &= ~SD_SERIALIZE; 4953 pflags &= ~SD_SERIALIZE;
4970 } 4954 }
@@ -5133,18 +5117,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5133 * two cpus are in the same cache domain, see cpus_share_cache(). 5117 * two cpus are in the same cache domain, see cpus_share_cache().
5134 */ 5118 */
5135DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5119DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size);
5136DEFINE_PER_CPU(int, sd_llc_id); 5121DEFINE_PER_CPU(int, sd_llc_id);
5137 5122
5138static void update_top_cache_domain(int cpu) 5123static void update_top_cache_domain(int cpu)
5139{ 5124{
5140 struct sched_domain *sd; 5125 struct sched_domain *sd;
5141 int id = cpu; 5126 int id = cpu;
5127 int size = 1;
5142 5128
5143 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5129 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5144 if (sd) 5130 if (sd) {
5145 id = cpumask_first(sched_domain_span(sd)); 5131 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd));
5133 }
5146 5134
5147 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size;
5148 per_cpu(sd_llc_id, cpu) = id; 5137 per_cpu(sd_llc_id, cpu) = id;
5149} 5138}
5150 5139
@@ -5168,6 +5157,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5168 tmp->parent = parent->parent; 5157 tmp->parent = parent->parent;
5169 if (parent->parent) 5158 if (parent->parent)
5170 parent->parent->child = tmp; 5159 parent->parent->child = tmp;
5160 /*
5161 * Transfer SD_PREFER_SIBLING down in case of a
5162 * degenerate parent; the spans match for this
5163 * so the property transfers.
5164 */
5165 if (parent->flags & SD_PREFER_SIBLING)
5166 tmp->flags |= SD_PREFER_SIBLING;
5171 destroy_sched_domain(parent, cpu); 5167 destroy_sched_domain(parent, cpu);
5172 } else 5168 } else
5173 tmp = tmp->parent; 5169 tmp = tmp->parent;
@@ -6234,8 +6230,9 @@ match1:
6234 ; 6230 ;
6235 } 6231 }
6236 6232
6233 n = ndoms_cur;
6237 if (doms_new == NULL) { 6234 if (doms_new == NULL) {
6238 ndoms_cur = 0; 6235 n = 0;
6239 doms_new = &fallback_doms; 6236 doms_new = &fallback_doms;
6240 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6237 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6241 WARN_ON_ONCE(dattr_new); 6238 WARN_ON_ONCE(dattr_new);
@@ -6243,7 +6240,7 @@ match1:
6243 6240
6244 /* Build new domains */ 6241 /* Build new domains */
6245 for (i = 0; i < ndoms_new; i++) { 6242 for (i = 0; i < ndoms_new; i++) {
6246 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6243 for (j = 0; j < n && !new_topology; j++) {
6247 if (cpumask_equal(doms_new[i], doms_cur[j]) 6244 if (cpumask_equal(doms_new[i], doms_cur[j])
6248 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6245 && dattrs_equal(dattr_new, i, dattr_cur, j))
6249 goto match2; 6246 goto match2;
@@ -6815,7 +6812,7 @@ void sched_move_task(struct task_struct *tsk)
6815 if (unlikely(running)) 6812 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk); 6813 tsk->sched_class->put_prev_task(rq, tsk);
6817 6814
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6815 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)), 6816 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css); 6817 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg); 6818 tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7134,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7137 7134
7138#ifdef CONFIG_CGROUP_SCHED 7135#ifdef CONFIG_CGROUP_SCHED
7139 7136
7140/* return corresponding task_group object of a cgroup */ 7137static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{ 7138{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7139 return css ? container_of(css, struct task_group, css) : NULL;
7144 struct task_group, css);
7145} 7140}
7146 7141
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7142static struct cgroup_subsys_state *
7143cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7148{ 7144{
7149 struct task_group *tg, *parent; 7145 struct task_group *parent = css_tg(parent_css);
7146 struct task_group *tg;
7150 7147
7151 if (!cgrp->parent) { 7148 if (!parent) {
7152 /* This is early initialization for the top cgroup */ 7149 /* This is early initialization for the top cgroup */
7153 return &root_task_group.css; 7150 return &root_task_group.css;
7154 } 7151 }
7155 7152
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent); 7153 tg = sched_create_group(parent);
7158 if (IS_ERR(tg)) 7154 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM); 7155 return ERR_PTR(-ENOMEM);
@@ -7161,41 +7157,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7161 return &tg->css; 7157 return &tg->css;
7162} 7158}
7163 7159
7164static int cpu_cgroup_css_online(struct cgroup *cgrp) 7160static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7165{ 7161{
7166 struct task_group *tg = cgroup_tg(cgrp); 7162 struct task_group *tg = css_tg(css);
7167 struct task_group *parent; 7163 struct task_group *parent = css_tg(css_parent(css));
7168 7164
7169 if (!cgrp->parent) 7165 if (parent)
7170 return 0; 7166 sched_online_group(tg, parent);
7171
7172 parent = cgroup_tg(cgrp->parent);
7173 sched_online_group(tg, parent);
7174 return 0; 7167 return 0;
7175} 7168}
7176 7169
7177static void cpu_cgroup_css_free(struct cgroup *cgrp) 7170static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7178{ 7171{
7179 struct task_group *tg = cgroup_tg(cgrp); 7172 struct task_group *tg = css_tg(css);
7180 7173
7181 sched_destroy_group(tg); 7174 sched_destroy_group(tg);
7182} 7175}
7183 7176
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7177static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7185{ 7178{
7186 struct task_group *tg = cgroup_tg(cgrp); 7179 struct task_group *tg = css_tg(css);
7187 7180
7188 sched_offline_group(tg); 7181 sched_offline_group(tg);
7189} 7182}
7190 7183
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7184static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7192 struct cgroup_taskset *tset) 7185 struct cgroup_taskset *tset)
7193{ 7186{
7194 struct task_struct *task; 7187 struct task_struct *task;
7195 7188
7196 cgroup_taskset_for_each(task, cgrp, tset) { 7189 cgroup_taskset_for_each(task, css, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED 7190#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7191 if (!sched_rt_can_attach(css_tg(css), task))
7199 return -EINVAL; 7192 return -EINVAL;
7200#else 7193#else
7201 /* We don't support RT-tasks being in separate groups */ 7194 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7199,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7206 return 0; 7199 return 0;
7207} 7200}
7208 7201
7209static void cpu_cgroup_attach(struct cgroup *cgrp, 7202static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7210 struct cgroup_taskset *tset) 7203 struct cgroup_taskset *tset)
7211{ 7204{
7212 struct task_struct *task; 7205 struct task_struct *task;
7213 7206
7214 cgroup_taskset_for_each(task, cgrp, tset) 7207 cgroup_taskset_for_each(task, css, tset)
7215 sched_move_task(task); 7208 sched_move_task(task);
7216} 7209}
7217 7210
7218static void 7211static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7212 struct cgroup_subsys_state *old_css,
7220 struct task_struct *task) 7213 struct task_struct *task)
7221{ 7214{
7222 /* 7215 /*
7223 * cgroup_exit() is called in the copy_process() failure path. 7216 * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7224,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7231} 7224}
7232 7225
7233#ifdef CONFIG_FAIR_GROUP_SCHED 7226#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7227static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7235 u64 shareval) 7228 struct cftype *cftype, u64 shareval)
7236{ 7229{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7230 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7238} 7231}
7239 7232
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7233static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7234 struct cftype *cft)
7241{ 7235{
7242 struct task_group *tg = cgroup_tg(cgrp); 7236 struct task_group *tg = css_tg(css);
7243 7237
7244 return (u64) scale_load_down(tg->shares); 7238 return (u64) scale_load_down(tg->shares);
7245} 7239}
@@ -7361,26 +7355,28 @@ long tg_get_cfs_period(struct task_group *tg)
7361 return cfs_period_us; 7355 return cfs_period_us;
7362} 7356}
7363 7357
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7358static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7359 struct cftype *cft)
7365{ 7360{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7361 return tg_get_cfs_quota(css_tg(css));
7367} 7362}
7368 7363
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7364static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7370 s64 cfs_quota_us) 7365 struct cftype *cftype, s64 cfs_quota_us)
7371{ 7366{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7367 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7373} 7368}
7374 7369
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7370static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7371 struct cftype *cft)
7376{ 7372{
7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7373 return tg_get_cfs_period(css_tg(css));
7378} 7374}
7379 7375
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7376static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7381 u64 cfs_period_us) 7377 struct cftype *cftype, u64 cfs_period_us)
7382{ 7378{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7379 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7384} 7380}
7385 7381
7386struct cfs_schedulable_data { 7382struct cfs_schedulable_data {
@@ -7461,10 +7457,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7461 return ret; 7457 return ret;
7462} 7458}
7463 7459
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7460static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7465 struct cgroup_map_cb *cb) 7461 struct cgroup_map_cb *cb)
7466{ 7462{
7467 struct task_group *tg = cgroup_tg(cgrp); 7463 struct task_group *tg = css_tg(css);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7464 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469 7465
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7466 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7473,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7473#endif /* CONFIG_FAIR_GROUP_SCHED */
7478 7474
7479#ifdef CONFIG_RT_GROUP_SCHED 7475#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7476static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7481 s64 val) 7477 struct cftype *cft, s64 val)
7482{ 7478{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7479 return sched_group_set_rt_runtime(css_tg(css), val);
7484} 7480}
7485 7481
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7482static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7483 struct cftype *cft)
7487{ 7484{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7485 return sched_group_rt_runtime(css_tg(css));
7489} 7486}
7490 7487
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7488static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7492 u64 rt_period_us) 7489 struct cftype *cftype, u64 rt_period_us)
7493{ 7490{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7491 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7495} 7492}
7496 7493
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7494static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7495 struct cftype *cft)
7498{ 7496{
7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7497 return sched_group_rt_period(css_tg(css));
7500} 7498}
7501#endif /* CONFIG_RT_GROUP_SCHED */ 7499#endif /* CONFIG_RT_GROUP_SCHED */
7502 7500
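
The conversions above all rely on the container_of() idiom: css_tg() recovers the task_group that embeds a given cgroup_subsys_state, and maps NULL to NULL so the root/parentless cases stay simple. A self-contained illustration follows; the struct names are stand-ins, not the kernel types.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct css {                    /* stand-in for struct cgroup_subsys_state */
        int serial;
};

struct task_group_like {        /* stand-in for struct task_group */
        long shares;
        struct css css;         /* embedded member, as in the kernel */
};

static struct task_group_like *css_to_tg(struct css *css)
{
        return css ? container_of(css, struct task_group_like, css) : NULL;
}

int main(void)
{
        struct task_group_like tg = { .shares = 1024, .css = { .serial = 7 } };

        printf("shares via css: %ld\n", css_to_tg(&tg.css)->shares);
        printf("NULL maps to:   %p\n", (void *)css_to_tg(NULL));
        return 0;
}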
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..99947919e30b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
121 * is the only cgroup, then nothing else should be necessary. 121 * is the only cgroup, then nothing else should be necessary.
122 * 122 *
123 */ 123 */
124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
125 125
126 cpuacct_account_field(p, index, tmp); 126 cpuacct_account_field(p, index, tmp);
127} 127}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379 379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev) 381void vtime_common_task_switch(struct task_struct *prev)
382{ 382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev)) 383 if (is_idle_task(prev))
387 vtime_account_idle(prev); 384 vtime_account_idle(prev);
388 else 385 else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
404 * vtime_account(). 401 * vtime_account().
405 */ 402 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT 403#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk) 404void vtime_common_account_irq_enter(struct task_struct *tsk)
408{ 405{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) { 406 if (!in_interrupt()) {
413 /* 407 /*
414 * If we interrupted user, context_tracking_in_user() 408 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
428 } 422 }
429 vtime_account_system(tsk); 423 vtime_account_system(tsk);
430} 424}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 425EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 426#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 427#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434 428
@@ -557,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr,
557 struct cputime *prev, 551 struct cputime *prev,
558 cputime_t *ut, cputime_t *st) 552 cputime_t *ut, cputime_t *st)
559{ 553{
560 cputime_t rtime, stime, utime, total; 554 cputime_t rtime, stime, utime;
561
562 if (vtime_accounting_enabled()) {
563 *ut = curr->utime;
564 *st = curr->stime;
565 return;
566 }
567
568 stime = curr->stime;
569 total = stime + curr->utime;
570 555
571 /* 556 /*
572 * Tick based cputime accounting depend on random scheduling 557 * Tick based cputime accounting depend on random scheduling
@@ -588,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr,
588 if (prev->stime + prev->utime >= rtime) 573 if (prev->stime + prev->utime >= rtime)
589 goto out; 574 goto out;
590 575
591 if (total) { 576 stime = curr->stime;
577 utime = curr->utime;
578
579 if (utime == 0) {
580 stime = rtime;
581 } else if (stime == 0) {
582 utime = rtime;
583 } else {
584 cputime_t total = stime + utime;
585
592 stime = scale_stime((__force u64)stime, 586 stime = scale_stime((__force u64)stime,
593 (__force u64)rtime, (__force u64)total); 587 (__force u64)rtime, (__force u64)total);
594 utime = rtime - stime; 588 utime = rtime - stime;
595 } else {
596 stime = rtime;
597 utime = 0;
598 } 589 }
599 590
600 /* 591 /*
@@ -664,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk)
664 655
665void vtime_account_system(struct task_struct *tsk) 656void vtime_account_system(struct task_struct *tsk)
666{ 657{
667 if (!vtime_accounting_enabled())
668 return;
669
670 write_seqlock(&tsk->vtime_seqlock); 658 write_seqlock(&tsk->vtime_seqlock);
671 __vtime_account_system(tsk); 659 __vtime_account_system(tsk);
672 write_sequnlock(&tsk->vtime_seqlock); 660 write_sequnlock(&tsk->vtime_seqlock);
673} 661}
674 662
675void vtime_account_irq_exit(struct task_struct *tsk) 663void vtime_gen_account_irq_exit(struct task_struct *tsk)
676{ 664{
677 if (!vtime_accounting_enabled())
678 return;
679
680 write_seqlock(&tsk->vtime_seqlock); 665 write_seqlock(&tsk->vtime_seqlock);
666 __vtime_account_system(tsk);
681 if (context_tracking_in_user()) 667 if (context_tracking_in_user())
682 tsk->vtime_snap_whence = VTIME_USER; 668 tsk->vtime_snap_whence = VTIME_USER;
683 __vtime_account_system(tsk);
684 write_sequnlock(&tsk->vtime_seqlock); 669 write_sequnlock(&tsk->vtime_seqlock);
685} 670}
686 671
@@ -688,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk)
688{ 673{
689 cputime_t delta_cpu; 674 cputime_t delta_cpu;
690 675
691 if (!vtime_accounting_enabled())
692 return;
693
694 delta_cpu = get_vtime_delta(tsk);
695
696 write_seqlock(&tsk->vtime_seqlock); 676 write_seqlock(&tsk->vtime_seqlock);
677 delta_cpu = get_vtime_delta(tsk);
697 tsk->vtime_snap_whence = VTIME_SYS; 678 tsk->vtime_snap_whence = VTIME_SYS;
698 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 679 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
699 write_sequnlock(&tsk->vtime_seqlock); 680 write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk)
701 682
702void vtime_user_enter(struct task_struct *tsk) 683void vtime_user_enter(struct task_struct *tsk)
703{ 684{
704 if (!vtime_accounting_enabled())
705 return;
706
707 write_seqlock(&tsk->vtime_seqlock); 685 write_seqlock(&tsk->vtime_seqlock);
708 tsk->vtime_snap_whence = VTIME_USER;
709 __vtime_account_system(tsk); 686 __vtime_account_system(tsk);
687 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 688 write_sequnlock(&tsk->vtime_seqlock);
711} 689}
712 690
713void vtime_guest_enter(struct task_struct *tsk) 691void vtime_guest_enter(struct task_struct *tsk)
714{ 692{
693 /*
694 * The flags must be updated under the lock with
695 * the vtime_snap flush and update.
696 * That enforces the right ordering and synchronizes the update
697 * sequence against the reader (task_gtime()), which can thus
698 * safely catch up with a tickless delta.
699 */
715 write_seqlock(&tsk->vtime_seqlock); 700 write_seqlock(&tsk->vtime_seqlock);
716 __vtime_account_system(tsk); 701 __vtime_account_system(tsk);
717 current->flags |= PF_VCPU; 702 current->flags |= PF_VCPU;
718 write_sequnlock(&tsk->vtime_seqlock); 703 write_sequnlock(&tsk->vtime_seqlock);
719} 704}
705EXPORT_SYMBOL_GPL(vtime_guest_enter);
720 706
721void vtime_guest_exit(struct task_struct *tsk) 707void vtime_guest_exit(struct task_struct *tsk)
722{ 708{
@@ -725,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk)
725 current->flags &= ~PF_VCPU; 711 current->flags &= ~PF_VCPU;
726 write_sequnlock(&tsk->vtime_seqlock); 712 write_sequnlock(&tsk->vtime_seqlock);
727} 713}
714EXPORT_SYMBOL_GPL(vtime_guest_exit);
728 715
729void vtime_account_idle(struct task_struct *tsk) 716void vtime_account_idle(struct task_struct *tsk)
730{ 717{
@@ -733,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk)
733 account_idle_time(delta_cpu); 720 account_idle_time(delta_cpu);
734} 721}
735 722
736bool vtime_accounting_enabled(void)
737{
738 return context_tracking_active();
739}
740
741void arch_vtime_task_switch(struct task_struct *prev) 723void arch_vtime_task_switch(struct task_struct *prev)
742{ 724{
743 write_seqlock(&prev->vtime_seqlock); 725 write_seqlock(&prev->vtime_seqlock);
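
The reworked cputime_adjust() above splits the precise rtime in the same proportion as the tick-sampled stime/utime, with the two all-zero corner cases handled first. Below is a standalone sketch of that arithmetic; note that it uses a plain 64-bit multiply where the kernel uses the overflow-safe scale_stime() helper, and adjust() is an illustrative name rather than the kernel function.

#include <stdio.h>
#include <stdint.h>

static void adjust(uint64_t rtime, uint64_t tick_stime, uint64_t tick_utime,
                   uint64_t *stime, uint64_t *utime)
{
        if (tick_utime == 0) {          /* never sampled in user mode */
                *stime = rtime;
                *utime = 0;
        } else if (tick_stime == 0) {   /* never sampled in kernel mode */
                *stime = 0;
                *utime = rtime;
        } else {
                uint64_t total = tick_stime + tick_utime;

                /* Split rtime in the same proportion as the tick samples. */
                *stime = tick_stime * rtime / total;
                *utime = rtime - *stime;
        }
}

int main(void)
{
        uint64_t stime, utime;

        /* A 25%/75% tick ratio applied to 1000 units of precise runtime. */
        adjust(1000, 25, 75, &stime, &utime);
        printf("stime=%llu utime=%llu\n",
               (unsigned long long)stime, (unsigned long long)utime);  /* 250 / 750 */
        return 0;
}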
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bddd4c66..196559994f7c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
124 SEQ_printf(m, " "); 124 SEQ_printf(m, " ");
125 125
126 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", 126 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
127 p->comm, p->pid, 127 p->comm, task_pid_nr(p),
128 SPLIT_NS(p->se.vruntime), 128 SPLIT_NS(p->se.vruntime),
129 (long long)(p->nvcsw + p->nivcsw), 129 (long long)(p->nvcsw + p->nivcsw),
130 p->prio); 130 p->prio);
@@ -289,7 +289,7 @@ do { \
289 P(nr_load_updates); 289 P(nr_load_updates);
290 P(nr_uninterruptible); 290 P(nr_uninterruptible);
291 PN(next_balance); 291 PN(next_balance);
292 P(curr->pid); 292 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
293 PN(clock); 293 PN(clock);
294 P(cpu_load[0]); 294 P(cpu_load[0]);
295 P(cpu_load[1]); 295 P(cpu_load[1]);
@@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 492{
493 unsigned long nr_switches; 493 unsigned long nr_switches;
494 494
495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
496 get_nr_threads(p)); 496 get_nr_threads(p));
497 SEQ_printf(m, 497 SEQ_printf(m,
498 "---------------------------------------------------------" 498 "---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68f1609ca149..7c70201fbc61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3018 return 0; 3018 return 0;
3019} 3019}
3020 3020
3021static void record_wakee(struct task_struct *p)
3022{
3023 /*
3024 * Rough decay (wiping) for cost saving; don't worry
3025 * about the boundary, a really active task won't care
3026 * about the loss.
3027 */
3028 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3029 current->wakee_flips = 0;
3030 current->wakee_flip_decay_ts = jiffies;
3031 }
3032
3033 if (current->last_wakee != p) {
3034 current->last_wakee = p;
3035 current->wakee_flips++;
3036 }
3037}
3021 3038
3022static void task_waking_fair(struct task_struct *p) 3039static void task_waking_fair(struct task_struct *p)
3023{ 3040{
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
3038#endif 3055#endif
3039 3056
3040 se->vruntime -= min_vruntime; 3057 se->vruntime -= min_vruntime;
3058 record_wakee(p);
3041} 3059}
3042 3060
3043#ifdef CONFIG_FAIR_GROUP_SCHED 3061#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
3156 3174
3157#endif 3175#endif
3158 3176
3177static int wake_wide(struct task_struct *p)
3178{
3179 int factor = this_cpu_read(sd_llc_size);
3180
3181 /*
3182 * This is the switching frequency; a high value can mean many wakees
3183 * or rapid switching. Using the LLC size as the factor automatically
3184 * adjusts the threshold, so a bigger node leads to more pulling.
3185 */
3186 if (p->wakee_flips > factor) {
3187 /*
3188 * The wakee is somewhat hot and needs a certain amount of cpu
3189 * resource, so if the waker is far hotter, prefer to leave
3190 * the wakee alone.
3191 */
3192 if (current->wakee_flips > (factor * p->wakee_flips))
3193 return 1;
3194 }
3195
3196 return 0;
3197}
3198
3159static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3199static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3160{ 3200{
3161 s64 this_load, load; 3201 s64 this_load, load;
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3165 unsigned long weight; 3205 unsigned long weight;
3166 int balanced; 3206 int balanced;
3167 3207
3208 /*
3209 * If we wake multiple tasks be careful to not bounce
3210 * ourselves around too much.
3211 */
3212 if (wake_wide(p))
3213 return 0;
3214
3168 idx = sd->wake_idx; 3215 idx = sd->wake_idx;
3169 this_cpu = smp_processor_id(); 3216 this_cpu = smp_processor_id();
3170 prev_cpu = task_cpu(p); 3217 prev_cpu = task_cpu(p);
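
Taken together, record_wakee() and wake_wide() above implement a small heuristic: count how often a task wakes a different partner than last time, decay that count roughly once a second, and refuse the affine wakeup when the wakee switches partners a lot and the waker does so far more. The following userspace model is illustrative only; the task struct, the explicit waker argument, and the fake jiffies clock are assumptions made so the sketch can run on its own.

#include <stdbool.h>
#include <stdio.h>

#define LLC_SIZE 4      /* pretend the last-level cache spans 4 CPUs */
#define HZ       100    /* fake tick rate for the decay window */

struct task {
        struct task *last_wakee;
        unsigned int wakee_flips;
        unsigned long flip_decay_ts;
};

static unsigned long jiffies;   /* fake clock */

static void record_wakee(struct task *waker, struct task *wakee)
{
        if (jiffies > waker->flip_decay_ts + HZ) {      /* rough decay */
                waker->wakee_flips = 0;
                waker->flip_decay_ts = jiffies;
        }
        if (waker->last_wakee != wakee) {
                waker->last_wakee = wakee;
                waker->wakee_flips++;
        }
}

static bool wake_wide(struct task *waker, struct task *wakee)
{
        /* Both sides switch partners a lot, the waker much more so: don't pull. */
        return wakee->wakee_flips > LLC_SIZE &&
               waker->wakee_flips > LLC_SIZE * wakee->wakee_flips;
}

int main(void)
{
        static struct task waker, wakee, others[8];

        for (int i = 0; i < 64; i++) {  /* waker fans out across many partners */
                record_wakee(&waker, &others[i % 8]);
                jiffies++;
        }
        for (int i = 0; i < 8; i++) {   /* wakee switches among a few partners */
                record_wakee(&wakee, &others[i % 4]);
                jiffies++;
        }
        printf("waker flips=%u wakee flips=%u -> wake_wide=%d\n",
               waker.wakee_flips, wakee.wakee_flips, wake_wide(&waker, &wakee));
        return 0;
}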
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu)
4172} 4219}
4173 4220
4174/* 4221/*
4175 * Compute the cpu's hierarchical load factor for each task group. 4222 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
4176 * This needs to be done in a top-down fashion because the load of a child 4223 * This needs to be done in a top-down fashion because the load of a child
4177 * group is a fraction of its parent's load. 4224 * group is a fraction of its parent's load.
4178 */ 4225 */
4179static int tg_load_down(struct task_group *tg, void *data) 4226static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4180{ 4227{
4181 unsigned long load; 4228 struct rq *rq = rq_of(cfs_rq);
4182 long cpu = (long)data; 4229 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4183
4184 if (!tg->parent) {
4185 load = cpu_rq(cpu)->avg.load_avg_contrib;
4186 } else {
4187 load = tg->parent->cfs_rq[cpu]->h_load;
4188 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4189 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4190 }
4191
4192 tg->cfs_rq[cpu]->h_load = load;
4193
4194 return 0;
4195}
4196
4197static void update_h_load(long cpu)
4198{
4199 struct rq *rq = cpu_rq(cpu);
4200 unsigned long now = jiffies; 4230 unsigned long now = jiffies;
4231 unsigned long load;
4201 4232
4202 if (rq->h_load_throttle == now) 4233 if (cfs_rq->last_h_load_update == now)
4203 return; 4234 return;
4204 4235
4205 rq->h_load_throttle = now; 4236 cfs_rq->h_load_next = NULL;
4237 for_each_sched_entity(se) {
4238 cfs_rq = cfs_rq_of(se);
4239 cfs_rq->h_load_next = se;
4240 if (cfs_rq->last_h_load_update == now)
4241 break;
4242 }
4206 4243
4207 rcu_read_lock(); 4244 if (!se) {
4208 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 4245 cfs_rq->h_load = cfs_rq->runnable_load_avg;
4209 rcu_read_unlock(); 4246 cfs_rq->last_h_load_update = now;
4247 }
4248
4249 while ((se = cfs_rq->h_load_next) != NULL) {
4250 load = cfs_rq->h_load;
4251 load = div64_ul(load * se->avg.load_avg_contrib,
4252 cfs_rq->runnable_load_avg + 1);
4253 cfs_rq = group_cfs_rq(se);
4254 cfs_rq->h_load = load;
4255 cfs_rq->last_h_load_update = now;
4256 }
4210} 4257}
4211 4258
4212static unsigned long task_h_load(struct task_struct *p) 4259static unsigned long task_h_load(struct task_struct *p)
4213{ 4260{
4214 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4261 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4215 4262
4263 update_cfs_rq_h_load(cfs_rq);
4216 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 4264 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4217 cfs_rq->runnable_load_avg + 1); 4265 cfs_rq->runnable_load_avg + 1);
4218} 4266}
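
The rewritten h_load computation above can be read as a product of fractions walked from the root: each group's hierarchical load is its parent's h_load scaled by the group entity's share of the parent's runnable load. Below is a simplified recursive sketch; the kernel version instead walks iteratively via h_load_next, caches results per jiffy, and seeds the root from the rq's load_avg_contrib, all of which is dropped here.

#include <stdio.h>

struct grp {
        struct grp *parent;
        unsigned long se_contrib;       /* this group's entity contrib in the parent */
        unsigned long runnable;         /* runnable load inside this group */
        unsigned long h_load;           /* computed below */
};

static void update_h_load(struct grp *g)
{
        if (!g->parent) {
                g->h_load = g->runnable;        /* root: its own runnable load */
                return;
        }
        update_h_load(g->parent);
        g->h_load = g->parent->h_load * g->se_contrib /
                    (g->parent->runnable + 1);
}

int main(void)
{
        struct grp root = { .parent = NULL, .runnable = 2048 };
        struct grp mid  = { .parent = &root, .se_contrib = 1024, .runnable = 512 };
        struct grp leaf = { .parent = &mid,  .se_contrib = 256 };

        update_h_load(&leaf);
        printf("root=%lu mid=%lu leaf=%lu\n",
               root.h_load, mid.h_load, leaf.h_load);
        return 0;
}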
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
4221{ 4269{
4222} 4270}
4223 4271
4224static inline void update_h_load(long cpu)
4225{
4226}
4227
4228static unsigned long task_h_load(struct task_struct *p) 4272static unsigned long task_h_load(struct task_struct *p)
4229{ 4273{
4230 return p->se.avg.load_avg_contrib; 4274 return p->se.avg.load_avg_contrib;
@@ -4233,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p)
4233 4277
4234/********** Helpers for find_busiest_group ************************/ 4278/********** Helpers for find_busiest_group ************************/
4235/* 4279/*
4236 * sd_lb_stats - Structure to store the statistics of a sched_domain
4237 * during load balancing.
4238 */
4239struct sd_lb_stats {
4240 struct sched_group *busiest; /* Busiest group in this sd */
4241 struct sched_group *this; /* Local group in this sd */
4242 unsigned long total_load; /* Total load of all groups in sd */
4243 unsigned long total_pwr; /* Total power of all groups in sd */
4244 unsigned long avg_load; /* Average load across all groups in sd */
4245
4246 /** Statistics of this group */
4247 unsigned long this_load;
4248 unsigned long this_load_per_task;
4249 unsigned long this_nr_running;
4250 unsigned long this_has_capacity;
4251 unsigned int this_idle_cpus;
4252
4253 /* Statistics of the busiest group */
4254 unsigned int busiest_idle_cpus;
4255 unsigned long max_load;
4256 unsigned long busiest_load_per_task;
4257 unsigned long busiest_nr_running;
4258 unsigned long busiest_group_capacity;
4259 unsigned long busiest_has_capacity;
4260 unsigned int busiest_group_weight;
4261
4262 int group_imb; /* Is there imbalance in this sd */
4263};
4264
4265/*
4266 * sg_lb_stats - stats of a sched_group required for load_balancing 4280 * sg_lb_stats - stats of a sched_group required for load_balancing
4267 */ 4281 */
4268struct sg_lb_stats { 4282struct sg_lb_stats {
4269 unsigned long avg_load; /*Avg load across the CPUs of the group */ 4283 unsigned long avg_load; /*Avg load across the CPUs of the group */
4270 unsigned long group_load; /* Total load over the CPUs of the group */ 4284 unsigned long group_load; /* Total load over the CPUs of the group */
4271 unsigned long sum_nr_running; /* Nr tasks running in the group */
4272 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 4285 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4273 unsigned long group_capacity; 4286 unsigned long load_per_task;
4274 unsigned long idle_cpus; 4287 unsigned long group_power;
4275 unsigned long group_weight; 4288 unsigned int sum_nr_running; /* Nr tasks running in the group */
4289 unsigned int group_capacity;
4290 unsigned int idle_cpus;
4291 unsigned int group_weight;
4276 int group_imb; /* Is there an imbalance in the group ? */ 4292 int group_imb; /* Is there an imbalance in the group ? */
4277 int group_has_capacity; /* Is there extra capacity in the group? */ 4293 int group_has_capacity; /* Is there extra capacity in the group? */
4278}; 4294};
4279 4295
4296/*
4297 * sd_lb_stats - Structure to store the statistics of a sched_domain
4298 * during load balancing.
4299 */
4300struct sd_lb_stats {
4301 struct sched_group *busiest; /* Busiest group in this sd */
4302 struct sched_group *local; /* Local group in this sd */
4303 unsigned long total_load; /* Total load of all groups in sd */
4304 unsigned long total_pwr; /* Total power of all groups in sd */
4305 unsigned long avg_load; /* Average load across all groups in sd */
4306
4307 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
4308 struct sg_lb_stats local_stat; /* Statistics of the local group */
4309};
4310
4311static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4312{
4313 /*
4314 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
4315 * local_stat because update_sg_lb_stats() does a full clear/assignment.
4316 * We must however clear busiest_stat::avg_load because
4317 * update_sd_pick_busiest() reads this before assignment.
4318 */
4319 *sds = (struct sd_lb_stats){
4320 .busiest = NULL,
4321 .local = NULL,
4322 .total_load = 0UL,
4323 .total_pwr = 0UL,
4324 .busiest_stat = {
4325 .avg_load = 0UL,
4326 },
4327 };
4328}
4329
4280/** 4330/**
4281 * get_sd_load_idx - Obtain the load index for a given sched domain. 4331 * get_sd_load_idx - Obtain the load index for a given sched domain.
4282 * @sd: The sched_domain whose load_idx is to be obtained. 4332 * @sd: The sched_domain whose load_idx is to be obtained.
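
init_sd_lb_stats() above leans on a C guarantee worth spelling out: assigning from a braced initializer with designated members zero-fills every member that is not named, which is what lets the function clear only busiest_stat.avg_load explicitly. A tiny standalone example with stand-in structs (not the kernel types):

#include <stdio.h>

struct stats {
        unsigned long total_load;
        unsigned long avg_load;
        int group_imb;
};

struct domain_stats {
        void *busiest;
        unsigned long total_pwr;
        struct stats busiest_stat;
};

int main(void)
{
        struct domain_stats sds;

        sds = (struct domain_stats){
                .busiest_stat = {
                        .avg_load = 0UL,        /* read before it is recomputed */
                },
        };      /* .busiest, .total_pwr, and the other stats members become 0 */

        printf("busiest=%p total_pwr=%lu group_imb=%d\n",
               sds.busiest, sds.total_pwr, sds.busiest_stat.group_imb);
        return 0;
}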
@@ -4460,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4460 return 0; 4510 return 0;
4461} 4511}
4462 4512
4513/*
4514 * Group imbalance indicates (and tries to solve) the problem where balancing
4515 * groups is inadequate due to tsk_cpus_allowed() constraints.
4516 *
4517 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
4518 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
4519 * Something like:
4520 *
4521 * { 0 1 2 3 } { 4 5 6 7 }
4522 * * * * *
4523 *
4524 * If we were to balance group-wise we'd place two tasks in the first group and
4525 * two tasks in the second group. Clearly this is undesired as it will overload
4526 * cpu 3 and leave one of the cpus in the second group unused.
4527 *
4528 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see
4531 * sg_imbalanced().
4532 *
4533 * When this is so detected; this group becomes a candidate for busiest; see
 4534 * update_sd_pick_busiest(). And calculate_imbalance() and
 4535 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance.
4537 *
4538 * This is a somewhat tricky proposition since the next run might not find the
4539 * group imbalance and decide the groups need to be balanced again. A most
4540 * subtle and fragile situation.
4541 */
4542
4543struct sg_imb_stats {
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552}
4553
4554static inline void
4555update_sg_imb_stats(struct sg_imb_stats *sgi,
4556 unsigned long load, unsigned long nr_running)
4557{
4558 if (load > sgi->max_cpu_load)
4559 sgi->max_cpu_load = load;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562
4563 if (nr_running > sgi->max_nr_running)
4564 sgi->max_nr_running = nr_running;
4565 if (sgi->min_nr_running > nr_running)
4566 sgi->min_nr_running = nr_running;
4567}
4568
4569static inline int
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
4571{
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584
4585 return 0;
4586}
4587
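
A quick illustration of the criterion above (a user-space sketch, not kernel code; all numbers are invented): with an average task weight of 1024, a group whose busiest CPU carries 3072 of load and 3 tasks while another CPU in the same group sits idle trips both thresholds, so the group is flagged imbalanced.

/* User-space sketch of the sg_imbalanced() test; the values are made up. */
#include <stdio.h>

static int imbalanced(unsigned long max_load, unsigned long min_load,
		      unsigned long max_nr, unsigned long min_nr,
		      unsigned long load_per_task)
{
	return (max_load - min_load) >= load_per_task && (max_nr - min_nr) > 1;
}

int main(void)
{
	/* One CPU pinned with 3 tasks (load 3072), another idle, avg task weight 1024. */
	printf("%d\n", imbalanced(3072, 0, 3, 0, 1024));	/* prints 1: group imbalance */
	return 0;
}
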
4463/** 4588/**
4464 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 4589 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4465 * @env: The load balancing environment. 4590 * @env: The load balancing environment.
4466 * @group: sched_group whose statistics are to be updated. 4591 * @group: sched_group whose statistics are to be updated.
4467 * @load_idx: Load index of sched_domain of this_cpu for load calc. 4592 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4468 * @local_group: Does group contain this_cpu. 4593 * @local_group: Does group contain this_cpu.
4469 * @balance: Should we balance.
4470 * @sgs: variable to hold the statistics for this group. 4594 * @sgs: variable to hold the statistics for this group.
4471 */ 4595 */
4472static inline void update_sg_lb_stats(struct lb_env *env, 4596static inline void update_sg_lb_stats(struct lb_env *env,
4473 struct sched_group *group, int load_idx, 4597 struct sched_group *group, int load_idx,
4474 int local_group, int *balance, struct sg_lb_stats *sgs) 4598 int local_group, struct sg_lb_stats *sgs)
4475{ 4599{
4476 unsigned long nr_running, max_nr_running, min_nr_running; 4600 struct sg_imb_stats sgi;
4477 unsigned long load, max_cpu_load, min_cpu_load; 4601 unsigned long nr_running;
4478 unsigned int balance_cpu = -1, first_idle_cpu = 0; 4602 unsigned long load;
4479 unsigned long avg_load_per_task = 0;
4480 int i; 4603 int i;
4481 4604
4482 if (local_group) 4605 init_sg_imb_stats(&sgi);
4483 balance_cpu = group_balance_cpu(group);
4484
4485 /* Tally up the load of all CPUs in the group */
4486 max_cpu_load = 0;
4487 min_cpu_load = ~0UL;
4488 max_nr_running = 0;
4489 min_nr_running = ~0UL;
4490 4606
4491 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4492 struct rq *rq = cpu_rq(i); 4608 struct rq *rq = cpu_rq(i);
@@ -4495,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4495 4611
4496 /* Bias balancing toward cpus of our domain */ 4612 /* Bias balancing toward cpus of our domain */
4497 if (local_group) { 4613 if (local_group) {
4498 if (idle_cpu(i) && !first_idle_cpu &&
4499 cpumask_test_cpu(i, sched_group_mask(group))) {
4500 first_idle_cpu = 1;
4501 balance_cpu = i;
4502 }
4503
4504 load = target_load(i, load_idx); 4614 load = target_load(i, load_idx);
4505 } else { 4615 } else {
4506 load = source_load(i, load_idx); 4616 load = source_load(i, load_idx);
4507 if (load > max_cpu_load) 4617 update_sg_imb_stats(&sgi, load, nr_running);
4508 max_cpu_load = load;
4509 if (min_cpu_load > load)
4510 min_cpu_load = load;
4511
4512 if (nr_running > max_nr_running)
4513 max_nr_running = nr_running;
4514 if (min_nr_running > nr_running)
4515 min_nr_running = nr_running;
4516 } 4618 }
4517 4619
4518 sgs->group_load += load; 4620 sgs->group_load += load;
@@ -4522,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4522 sgs->idle_cpus++; 4624 sgs->idle_cpus++;
4523 } 4625 }
4524 4626
4525 /* 4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4526 * First idle cpu or the first cpu(busiest) in this sched group 4628 time_after_eq(jiffies, group->sgp->next_update)))
4527 * is eligible for doing load balancing at this and above 4629 update_group_power(env->sd, env->dst_cpu);
4528 * domains. In the newly idle case, we will allow all the cpu's
4529 * to do the newly idle load balance.
4530 */
4531 if (local_group) {
4532 if (env->idle != CPU_NEWLY_IDLE) {
4533 if (balance_cpu != env->dst_cpu) {
4534 *balance = 0;
4535 return;
4536 }
4537 update_group_power(env->sd, env->dst_cpu);
4538 } else if (time_after_eq(jiffies, group->sgp->next_update))
4539 update_group_power(env->sd, env->dst_cpu);
4540 }
4541 4630
4542 /* Adjust by relative CPU power of the group */ 4631 /* Adjust by relative CPU power of the group */
4543 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 4632 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4544 4634
4545 /*
4546 * Consider the group unbalanced when the imbalance is larger
4547 * than the average weight of a task.
4548 *
4549 * APZ: with cgroup the avg task weight can vary wildly and
4550 * might not be a suitable number - should we keep a
4551 * normalized nr_running number somewhere that negates
4552 * the hierarchy?
4553 */
4554 if (sgs->sum_nr_running) 4635 if (sgs->sum_nr_running)
4555 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4556 4637
4557 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && 4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4558 (max_nr_running - min_nr_running) > 1) 4639
4559 sgs->group_imb = 1; 4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4560 4642
4561 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4562 SCHED_POWER_SCALE);
4563 if (!sgs->group_capacity) 4643 if (!sgs->group_capacity)
4564 sgs->group_capacity = fix_small_capacity(env->sd, group); 4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4565 sgs->group_weight = group->group_weight; 4646 sgs->group_weight = group->group_weight;
4566 4647
4567 if (sgs->group_capacity > sgs->sum_nr_running) 4648 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4586,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4586 struct sched_group *sg, 4667 struct sched_group *sg,
4587 struct sg_lb_stats *sgs) 4668 struct sg_lb_stats *sgs)
4588{ 4669{
4589 if (sgs->avg_load <= sds->max_load) 4670 if (sgs->avg_load <= sds->busiest_stat.avg_load)
4590 return false; 4671 return false;
4591 4672
4592 if (sgs->sum_nr_running > sgs->group_capacity) 4673 if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4619,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4619 * @sds: variable to hold the statistics for this sched_domain. 4700 * @sds: variable to hold the statistics for this sched_domain.
4620 */ 4701 */
4621static inline void update_sd_lb_stats(struct lb_env *env, 4702static inline void update_sd_lb_stats(struct lb_env *env,
4622 int *balance, struct sd_lb_stats *sds) 4703 struct sd_lb_stats *sds)
4623{ 4704{
4624 struct sched_domain *child = env->sd->child; 4705 struct sched_domain *child = env->sd->child;
4625 struct sched_group *sg = env->sd->groups; 4706 struct sched_group *sg = env->sd->groups;
4626 struct sg_lb_stats sgs; 4707 struct sg_lb_stats tmp_sgs;
4627 int load_idx, prefer_sibling = 0; 4708 int load_idx, prefer_sibling = 0;
4628 4709
4629 if (child && child->flags & SD_PREFER_SIBLING) 4710 if (child && child->flags & SD_PREFER_SIBLING)
@@ -4632,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4632 load_idx = get_sd_load_idx(env->sd, env->idle); 4713 load_idx = get_sd_load_idx(env->sd, env->idle);
4633 4714
4634 do { 4715 do {
4716 struct sg_lb_stats *sgs = &tmp_sgs;
4635 int local_group; 4717 int local_group;
4636 4718
4637 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 4719 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4638 memset(&sgs, 0, sizeof(sgs)); 4720 if (local_group) {
4639 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); 4721 sds->local = sg;
4640 4722 sgs = &sds->local_stat;
4641 if (local_group && !(*balance)) 4723 }
4642 return;
4643 4724
4644 sds->total_load += sgs.group_load; 4725 memset(sgs, 0, sizeof(*sgs));
4645 sds->total_pwr += sg->sgp->power; 4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4646 4727
4647 /* 4728 /*
4648 * In case the child domain prefers tasks go to siblings 4729 * In case the child domain prefers tasks go to siblings
@@ -4654,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4654 * heaviest group when it is already under-utilized (possible 4735 * heaviest group when it is already under-utilized (possible
4655 * with a large weight task outweighs the tasks on the system). 4736 * with a large weight task outweighs the tasks on the system).
4656 */ 4737 */
4657 if (prefer_sibling && !local_group && sds->this_has_capacity) 4738 if (prefer_sibling && !local_group &&
4658 sgs.group_capacity = min(sgs.group_capacity, 1UL); 4739 sds->local && sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U);
4659 4741
4660 if (local_group) { 4742 /* Now, start updating sd_lb_stats */
4661 sds->this_load = sgs.avg_load; 4743 sds->total_load += sgs->group_load;
4662 sds->this = sg; 4744 sds->total_pwr += sgs->group_power;
4663 sds->this_nr_running = sgs.sum_nr_running; 4745
4664 sds->this_load_per_task = sgs.sum_weighted_load; 4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4665 sds->this_has_capacity = sgs.group_has_capacity;
4666 sds->this_idle_cpus = sgs.idle_cpus;
4667 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4668 sds->max_load = sgs.avg_load;
4669 sds->busiest = sg; 4747 sds->busiest = sg;
4670 sds->busiest_nr_running = sgs.sum_nr_running; 4748 sds->busiest_stat = *sgs;
4671 sds->busiest_idle_cpus = sgs.idle_cpus;
4672 sds->busiest_group_capacity = sgs.group_capacity;
4673 sds->busiest_load_per_task = sgs.sum_weighted_load;
4674 sds->busiest_has_capacity = sgs.group_has_capacity;
4675 sds->busiest_group_weight = sgs.group_weight;
4676 sds->group_imb = sgs.group_imb;
4677 } 4749 }
4678 4750
4679 sg = sg->next; 4751 sg = sg->next;
@@ -4718,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4718 return 0; 4790 return 0;
4719 4791
4720 env->imbalance = DIV_ROUND_CLOSEST( 4792 env->imbalance = DIV_ROUND_CLOSEST(
4721 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); 4793 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
4794 SCHED_POWER_SCALE);
4722 4795
4723 return 1; 4796 return 1;
4724} 4797}
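
For the asym-packing path above, the imbalance is simply the busiest group's power-scaled load converted back to an absolute amount of load to move. A small worked sketch with invented numbers (SCHED_POWER_SCALE = 1024):

/* Sketch: avg_load is already scaled by SCHED_POWER_SCALE / group_power, so
 * multiplying by group_power and dividing the scale back out recovers the raw
 * load. Values are illustrative only. */
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

int main(void)
{
	unsigned long avg_load = 1536;		/* power-scaled load of the busiest group */
	unsigned long group_power = 2048;	/* e.g. two full-power CPUs */

	/* DIV_ROUND_CLOSEST(1536 * 2048, 1024) == 3072 */
	unsigned long imbalance = (avg_load * group_power + SCHED_POWER_SCALE / 2) /
				  SCHED_POWER_SCALE;
	printf("%lu\n", imbalance);	/* 3072 */
	return 0;
}
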
@@ -4736,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4736 unsigned long tmp, pwr_now = 0, pwr_move = 0; 4809 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4737 unsigned int imbn = 2; 4810 unsigned int imbn = 2;
4738 unsigned long scaled_busy_load_per_task; 4811 unsigned long scaled_busy_load_per_task;
4812 struct sg_lb_stats *local, *busiest;
4739 4813
4740 if (sds->this_nr_running) { 4814 local = &sds->local_stat;
4741 sds->this_load_per_task /= sds->this_nr_running; 4815 busiest = &sds->busiest_stat;
4742 if (sds->busiest_load_per_task > 4816
4743 sds->this_load_per_task) 4817 if (!local->sum_nr_running)
4744 imbn = 1; 4818 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4745 } else { 4819 else if (busiest->load_per_task > local->load_per_task)
4746 sds->this_load_per_task = 4820 imbn = 1;
4747 cpu_avg_load_per_task(env->dst_cpu);
4748 }
4749 4821
4750 scaled_busy_load_per_task = sds->busiest_load_per_task 4822 scaled_busy_load_per_task =
4751 * SCHED_POWER_SCALE; 4823 (busiest->load_per_task * SCHED_POWER_SCALE) /
4752 scaled_busy_load_per_task /= sds->busiest->sgp->power; 4824 busiest->group_power;
4753 4825
4754 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 4826 if (busiest->avg_load + scaled_busy_load_per_task >=
4755 (scaled_busy_load_per_task * imbn)) { 4827 local->avg_load + (scaled_busy_load_per_task * imbn)) {
4756 env->imbalance = sds->busiest_load_per_task; 4828 env->imbalance = busiest->load_per_task;
4757 return; 4829 return;
4758 } 4830 }
4759 4831
@@ -4763,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4763 * moving them. 4835 * moving them.
4764 */ 4836 */
4765 4837
4766 pwr_now += sds->busiest->sgp->power * 4838 pwr_now += busiest->group_power *
4767 min(sds->busiest_load_per_task, sds->max_load); 4839 min(busiest->load_per_task, busiest->avg_load);
4768 pwr_now += sds->this->sgp->power * 4840 pwr_now += local->group_power *
4769 min(sds->this_load_per_task, sds->this_load); 4841 min(local->load_per_task, local->avg_load);
4770 pwr_now /= SCHED_POWER_SCALE; 4842 pwr_now /= SCHED_POWER_SCALE;
4771 4843
4772 /* Amount of load we'd subtract */ 4844 /* Amount of load we'd subtract */
4773 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4845 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4774 sds->busiest->sgp->power; 4846 busiest->group_power;
4775 if (sds->max_load > tmp) 4847 if (busiest->avg_load > tmp) {
4776 pwr_move += sds->busiest->sgp->power * 4848 pwr_move += busiest->group_power *
4777 min(sds->busiest_load_per_task, sds->max_load - tmp); 4849 min(busiest->load_per_task,
4850 busiest->avg_load - tmp);
4851 }
4778 4852
4779 /* Amount of load we'd add */ 4853 /* Amount of load we'd add */
4780 if (sds->max_load * sds->busiest->sgp->power < 4854 if (busiest->avg_load * busiest->group_power <
4781 sds->busiest_load_per_task * SCHED_POWER_SCALE) 4855 busiest->load_per_task * SCHED_POWER_SCALE) {
4782 tmp = (sds->max_load * sds->busiest->sgp->power) / 4856 tmp = (busiest->avg_load * busiest->group_power) /
4783 sds->this->sgp->power; 4857 local->group_power;
4784 else 4858 } else {
4785 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4859 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4786 sds->this->sgp->power; 4860 local->group_power;
4787 pwr_move += sds->this->sgp->power * 4861 }
4788 min(sds->this_load_per_task, sds->this_load + tmp); 4862 pwr_move += local->group_power *
4863 min(local->load_per_task, local->avg_load + tmp);
4789 pwr_move /= SCHED_POWER_SCALE; 4864 pwr_move /= SCHED_POWER_SCALE;
4790 4865
4791 /* Move if we gain throughput */ 4866 /* Move if we gain throughput */
4792 if (pwr_move > pwr_now) 4867 if (pwr_move > pwr_now)
4793 env->imbalance = sds->busiest_load_per_task; 4868 env->imbalance = busiest->load_per_task;
4794} 4869}
4795 4870
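
As a hedged walk-through of the early return in fix_small_imbalance() above (invented, conveniently round values; SCHED_POWER_SCALE = 1024): if even after hypothetically moving one task's worth of load the busiest side would still be at least as loaded as the local side plus imbn tasks, the imbalance is simply set to one task's load.

/* Sketch of the early-return condition; the numbers are made up. */
#include <stdio.h>

int main(void)
{
	unsigned long scale = 1024;		/* SCHED_POWER_SCALE */
	unsigned long busiest_lpt = 1024, busiest_avg = 1536, busiest_power = 1024;
	unsigned long local_lpt = 1024, local_avg = 256;
	unsigned int imbn = (busiest_lpt > local_lpt) ? 1 : 2;			/* 2 */
	unsigned long scaled = busiest_lpt * scale / busiest_power;		/* 1024 */

	if (busiest_avg + scaled >= local_avg + scaled * imbn)			/* 2560 >= 2304 */
		printf("move one task: imbalance = %lu\n", busiest_lpt);
	return 0;
}
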
4796/** 4871/**
@@ -4802,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4802static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 4877static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4803{ 4878{
4804 unsigned long max_pull, load_above_capacity = ~0UL; 4879 unsigned long max_pull, load_above_capacity = ~0UL;
4880 struct sg_lb_stats *local, *busiest;
4881
4882 local = &sds->local_stat;
4883 busiest = &sds->busiest_stat;
4805 4884
4806 sds->busiest_load_per_task /= sds->busiest_nr_running; 4885 if (busiest->group_imb) {
4807 if (sds->group_imb) { 4886 /*
4808 sds->busiest_load_per_task = 4887 * In the group_imb case we cannot rely on group-wide averages
4809 min(sds->busiest_load_per_task, sds->avg_load); 4888 * to ensure cpu-load equilibrium, look at wider averages. XXX
4889 */
4890 busiest->load_per_task =
4891 min(busiest->load_per_task, sds->avg_load);
4810 } 4892 }
4811 4893
4812 /* 4894 /*
@@ -4814,21 +4896,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4814 * max load less than avg load(as we skip the groups at or below 4896 * max load less than avg load(as we skip the groups at or below
4815 * its cpu_power, while calculating max_load..) 4897 * its cpu_power, while calculating max_load..)
4816 */ 4898 */
4817 if (sds->max_load < sds->avg_load) { 4899 if (busiest->avg_load <= sds->avg_load ||
4900 local->avg_load >= sds->avg_load) {
4818 env->imbalance = 0; 4901 env->imbalance = 0;
4819 return fix_small_imbalance(env, sds); 4902 return fix_small_imbalance(env, sds);
4820 } 4903 }
4821 4904
4822 if (!sds->group_imb) { 4905 if (!busiest->group_imb) {
4823 /* 4906 /*
4824 * Don't want to pull so many tasks that a group would go idle. 4907 * Don't want to pull so many tasks that a group would go idle.
4908 * Except of course for the group_imb case, since then we might
4909 * have to drop below capacity to reach cpu-load equilibrium.
4825 */ 4910 */
4826 load_above_capacity = (sds->busiest_nr_running - 4911 load_above_capacity =
4827 sds->busiest_group_capacity); 4912 (busiest->sum_nr_running - busiest->group_capacity);
4828 4913
4829 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 4914 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4830 4915 load_above_capacity /= busiest->group_power;
4831 load_above_capacity /= sds->busiest->sgp->power;
4832 } 4916 }
4833 4917
4834 /* 4918 /*
@@ -4838,15 +4922,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4838 * we also don't want to reduce the group load below the group capacity 4922 * we also don't want to reduce the group load below the group capacity
4839 * (so that we can implement power-savings policies etc). Thus we look 4923 * (so that we can implement power-savings policies etc). Thus we look
4840 * for the minimum possible imbalance. 4924 * for the minimum possible imbalance.
4841 * Be careful of negative numbers as they'll appear as very large values
4842 * with unsigned longs.
4843 */ 4925 */
4844 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4926 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4845 4927
4846 /* How much load to actually move to equalise the imbalance */ 4928 /* How much load to actually move to equalise the imbalance */
4847 env->imbalance = min(max_pull * sds->busiest->sgp->power, 4929 env->imbalance = min(
4848 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4930 max_pull * busiest->group_power,
4849 / SCHED_POWER_SCALE; 4931 (sds->avg_load - local->avg_load) * local->group_power
4932 ) / SCHED_POWER_SCALE;
4850 4933
4851 /* 4934 /*
4852 * if *imbalance is less than the average load per runnable task 4935 * if *imbalance is less than the average load per runnable task
@@ -4854,9 +4937,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4854 * a think about bumping its value to force at least one task to be 4937 * a think about bumping its value to force at least one task to be
4855 * moved 4938 * moved
4856 */ 4939 */
4857 if (env->imbalance < sds->busiest_load_per_task) 4940 if (env->imbalance < busiest->load_per_task)
4858 return fix_small_imbalance(env, sds); 4941 return fix_small_imbalance(env, sds);
4859
4860} 4942}
4861 4943
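
A worked instance of the imbalance formula above (a sketch with invented numbers, assuming the capacity term does not bite): busiest avg_load = 1536, local avg_load = 512, domain avg_load = 1024, both group powers = 1024 gives max_pull = 512 and an imbalance of 512.

/* Sketch of the calculate_imbalance() arithmetic; values are illustrative. */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
	unsigned long scale = 1024;	/* SCHED_POWER_SCALE */
	unsigned long busiest_avg = 1536, local_avg = 512, sd_avg = 1024;
	unsigned long busiest_power = 1024, local_power = 1024;
	unsigned long load_above_capacity = ~0UL;	/* assume capacity is not the limit */

	unsigned long max_pull = min_ul(busiest_avg - sd_avg, load_above_capacity);	/* 512 */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (sd_avg - local_avg) * local_power) / scale;
	printf("%lu\n", imbalance);	/* 512 */
	return 0;
}
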
4862/******* find_busiest_group() helpers end here *********************/ 4944/******* find_busiest_group() helpers end here *********************/
@@ -4872,69 +4954,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4872 * to restore balance. 4954 * to restore balance.
4873 * 4955 *
4874 * @env: The load balancing environment. 4956 * @env: The load balancing environment.
4875 * @balance: Pointer to a variable indicating if this_cpu
4876 * is the appropriate cpu to perform load balancing at this_level.
4877 * 4957 *
4878 * Return: - The busiest group if imbalance exists. 4958 * Return: - The busiest group if imbalance exists.
4879 * - If no imbalance and user has opted for power-savings balance, 4959 * - If no imbalance and user has opted for power-savings balance,
4880 * return the least loaded group whose CPUs can be 4960 * return the least loaded group whose CPUs can be
4881 * put to idle by rebalancing its tasks onto our group. 4961 * put to idle by rebalancing its tasks onto our group.
4882 */ 4962 */
4883static struct sched_group * 4963static struct sched_group *find_busiest_group(struct lb_env *env)
4884find_busiest_group(struct lb_env *env, int *balance)
4885{ 4964{
4965 struct sg_lb_stats *local, *busiest;
4886 struct sd_lb_stats sds; 4966 struct sd_lb_stats sds;
4887 4967
4888 memset(&sds, 0, sizeof(sds)); 4968 init_sd_lb_stats(&sds);
4889 4969
4890 /* 4970 /*
 4891 * Compute the various statistics relevant for load balancing at 4971 * Compute the various statistics relevant for load balancing at
4892 * this level. 4972 * this level.
4893 */ 4973 */
4894 update_sd_lb_stats(env, balance, &sds); 4974 update_sd_lb_stats(env, &sds);
4895 4975 local = &sds.local_stat;
4896 /* 4976 busiest = &sds.busiest_stat;
4897 * this_cpu is not the appropriate cpu to perform load balancing at
4898 * this level.
4899 */
4900 if (!(*balance))
4901 goto ret;
4902 4977
4903 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 4978 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4904 check_asym_packing(env, &sds)) 4979 check_asym_packing(env, &sds))
4905 return sds.busiest; 4980 return sds.busiest;
4906 4981
4907 /* There is no busy sibling group to pull tasks from */ 4982 /* There is no busy sibling group to pull tasks from */
4908 if (!sds.busiest || sds.busiest_nr_running == 0) 4983 if (!sds.busiest || busiest->sum_nr_running == 0)
4909 goto out_balanced; 4984 goto out_balanced;
4910 4985
4911 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 4986 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4912 4987
4913 /* 4988 /*
4914 * If the busiest group is imbalanced the below checks don't 4989 * If the busiest group is imbalanced the below checks don't
4915 * work because they assumes all things are equal, which typically 4990 * work because they assume all things are equal, which typically
4916 * isn't true due to cpus_allowed constraints and the like. 4991 * isn't true due to cpus_allowed constraints and the like.
4917 */ 4992 */
4918 if (sds.group_imb) 4993 if (busiest->group_imb)
4919 goto force_balance; 4994 goto force_balance;
4920 4995
4921 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4996 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4922 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4997 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4923 !sds.busiest_has_capacity) 4998 !busiest->group_has_capacity)
4924 goto force_balance; 4999 goto force_balance;
4925 5000
4926 /* 5001 /*
4927 * If the local group is more busy than the selected busiest group 5002 * If the local group is more busy than the selected busiest group
4928 * don't try and pull any tasks. 5003 * don't try and pull any tasks.
4929 */ 5004 */
4930 if (sds.this_load >= sds.max_load) 5005 if (local->avg_load >= busiest->avg_load)
4931 goto out_balanced; 5006 goto out_balanced;
4932 5007
4933 /* 5008 /*
4934 * Don't pull any tasks if this group is already above the domain 5009 * Don't pull any tasks if this group is already above the domain
4935 * average load. 5010 * average load.
4936 */ 5011 */
4937 if (sds.this_load >= sds.avg_load) 5012 if (local->avg_load >= sds.avg_load)
4938 goto out_balanced; 5013 goto out_balanced;
4939 5014
4940 if (env->idle == CPU_IDLE) { 5015 if (env->idle == CPU_IDLE) {
@@ -4944,15 +5019,16 @@ find_busiest_group(struct lb_env *env, int *balance)
4944 * there is no imbalance between this and busiest group 5019 * there is no imbalance between this and busiest group
4945 * wrt to idle cpu's, it is balanced. 5020 * wrt to idle cpu's, it is balanced.
4946 */ 5021 */
4947 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 5022 if ((local->idle_cpus < busiest->idle_cpus) &&
4948 sds.busiest_nr_running <= sds.busiest_group_weight) 5023 busiest->sum_nr_running <= busiest->group_weight)
4949 goto out_balanced; 5024 goto out_balanced;
4950 } else { 5025 } else {
4951 /* 5026 /*
4952 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 5027 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4953 * imbalance_pct to be conservative. 5028 * imbalance_pct to be conservative.
4954 */ 5029 */
4955 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) 5030 if (100 * busiest->avg_load <=
5031 env->sd->imbalance_pct * local->avg_load)
4956 goto out_balanced; 5032 goto out_balanced;
4957 } 5033 }
4958 5034
@@ -4962,7 +5038,6 @@ force_balance:
4962 return sds.busiest; 5038 return sds.busiest;
4963 5039
4964out_balanced: 5040out_balanced:
4965ret:
4966 env->imbalance = 0; 5041 env->imbalance = 0;
4967 return NULL; 5042 return NULL;
4968} 5043}
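
The imbalance_pct filter near the end of find_busiest_group() reads as "only pull if the busiest group is more than imbalance_pct percent busier than the local group". A quick numeric check (invented values; 125 is the common default at lower domain levels):

/* Sketch of the conservative imbalance_pct check; numbers are illustrative. */
#include <stdio.h>

int main(void)
{
	unsigned int imbalance_pct = 125;	/* i.e. require >25% more load */
	unsigned long local_avg = 1000;
	unsigned long busiest_avg;

	for (busiest_avg = 1200; busiest_avg <= 1300; busiest_avg += 100) {
		int balanced = 100 * busiest_avg <= imbalance_pct * local_avg;
		printf("busiest=%lu -> %s\n", busiest_avg,
		       balanced ? "out_balanced" : "try to pull");
	}
	/* 1200: 120000 <= 125000 -> out_balanced; 1300: 130000 > 125000 -> pull */
	return 0;
}
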
@@ -4974,10 +5049,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4974 struct sched_group *group) 5049 struct sched_group *group)
4975{ 5050{
4976 struct rq *busiest = NULL, *rq; 5051 struct rq *busiest = NULL, *rq;
4977 unsigned long max_load = 0; 5052 unsigned long busiest_load = 0, busiest_power = 1;
4978 int i; 5053 int i;
4979 5054
4980 for_each_cpu(i, sched_group_cpus(group)) { 5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4981 unsigned long power = power_of(i); 5056 unsigned long power = power_of(i);
4982 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5057 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4983 SCHED_POWER_SCALE); 5058 SCHED_POWER_SCALE);
@@ -4986,9 +5061,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4986 if (!capacity) 5061 if (!capacity)
4987 capacity = fix_small_capacity(env->sd, group); 5062 capacity = fix_small_capacity(env->sd, group);
4988 5063
4989 if (!cpumask_test_cpu(i, env->cpus))
4990 continue;
4991
4992 rq = cpu_rq(i); 5064 rq = cpu_rq(i);
4993 wl = weighted_cpuload(i); 5065 wl = weighted_cpuload(i);
4994 5066
@@ -5004,11 +5076,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5004 * the weighted_cpuload() scaled with the cpu power, so that 5076 * the weighted_cpuload() scaled with the cpu power, so that
5005 * the load can be moved away from the cpu that is potentially 5077 * the load can be moved away from the cpu that is potentially
5006 * running at a lower capacity. 5078 * running at a lower capacity.
5079 *
5080 * Thus we're looking for max(wl_i / power_i), crosswise
5081 * multiplication to rid ourselves of the division works out
5082 * to: wl_i * power_j > wl_j * power_i; where j is our
5083 * previous maximum.
5007 */ 5084 */
5008 wl = (wl * SCHED_POWER_SCALE) / power; 5085 if (wl * busiest_power > busiest_load * power) {
5009 5086 busiest_load = wl;
5010 if (wl > max_load) { 5087 busiest_power = power;
5011 max_load = wl;
5012 busiest = rq; 5088 busiest = rq;
5013 } 5089 }
5014 } 5090 }
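
The cross-multiplication above selects max(wl_i / power_i) without a division per CPU. A small check with invented numbers: the current busiest has wl = 2048 at power = 1024 (ratio 2.0); a candidate with wl = 3072 at power = 2048 (ratio 1.5) must not replace it, and indeed 3072 * 1024 < 2048 * 2048.

/* Sketch of the division-free comparison in find_busiest_queue(); invented loads. */
#include <stdio.h>

int main(void)
{
	unsigned long busiest_load = 2048, busiest_power = 1024;	/* ratio 2.0 */
	unsigned long wl = 3072, power = 2048;				/* ratio 1.5 */

	if (wl * busiest_power > busiest_load * power)
		printf("candidate becomes busiest\n");
	else
		printf("keep previous busiest\n");	/* 3145728 < 4194304 */
	return 0;
}
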
@@ -5045,13 +5121,47 @@ static int need_active_balance(struct lb_env *env)
5045 5121
5046static int active_load_balance_cpu_stop(void *data); 5122static int active_load_balance_cpu_stop(void *data);
5047 5123
5124static int should_we_balance(struct lb_env *env)
5125{
5126 struct sched_group *sg = env->sd->groups;
5127 struct cpumask *sg_cpus, *sg_mask;
5128 int cpu, balance_cpu = -1;
5129
5130 /*
5131 * In the newly idle case, we will allow all the cpu's
5132 * to do the newly idle load balance.
5133 */
5134 if (env->idle == CPU_NEWLY_IDLE)
5135 return 1;
5136
5137 sg_cpus = sched_group_cpus(sg);
5138 sg_mask = sched_group_mask(sg);
5139 /* Try to find first idle cpu */
5140 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
5141 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
5142 continue;
5143
5144 balance_cpu = cpu;
5145 break;
5146 }
5147
5148 if (balance_cpu == -1)
5149 balance_cpu = group_balance_cpu(sg);
5150
5151 /*
5152 * First idle cpu or the first cpu(busiest) in this sched group
5153 * is eligible for doing load balancing at this and above domains.
5154 */
5155 return balance_cpu == env->dst_cpu;
5156}
5157
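
A user-space analogue of the CPU-selection rule in should_we_balance() (a sketch only; the helper and its inputs below are invented): the first idle CPU permitted by the mask wins, otherwise a designated fallback, and only that CPU reports that it should run this balance pass.

/* Invented stand-in for the selection logic; not kernel code. */
#include <stdio.h>

static int pick_balance_cpu(const int *cpus, const int *is_idle, int n, int fallback)
{
	int i;

	for (i = 0; i < n; i++)
		if (is_idle[i])
			return cpus[i];		/* first idle cpu in the group */
	return fallback;			/* e.g. what group_balance_cpu() provides */
}

int main(void)
{
	int cpus[] = { 0, 1, 2, 3 };
	int idle[] = { 0, 0, 1, 0 };		/* cpu 2 is idle */
	int dst_cpu = 2;

	printf("%s\n", pick_balance_cpu(cpus, idle, 4, 0) == dst_cpu ?
	       "dst_cpu balances" : "someone else balances");
	return 0;
}
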
5048/* 5158/*
5049 * Check this_cpu to ensure it is balanced within domain. Attempt to move 5159 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5050 * tasks if there is an imbalance. 5160 * tasks if there is an imbalance.
5051 */ 5161 */
5052static int load_balance(int this_cpu, struct rq *this_rq, 5162static int load_balance(int this_cpu, struct rq *this_rq,
5053 struct sched_domain *sd, enum cpu_idle_type idle, 5163 struct sched_domain *sd, enum cpu_idle_type idle,
5054 int *balance) 5164 int *continue_balancing)
5055{ 5165{
5056 int ld_moved, cur_ld_moved, active_balance = 0; 5166 int ld_moved, cur_ld_moved, active_balance = 0;
5057 struct sched_group *group; 5167 struct sched_group *group;
@@ -5081,11 +5191,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5081 schedstat_inc(sd, lb_count[idle]); 5191 schedstat_inc(sd, lb_count[idle]);
5082 5192
5083redo: 5193redo:
5084 group = find_busiest_group(&env, balance); 5194 if (!should_we_balance(&env)) {
5085 5195 *continue_balancing = 0;
5086 if (*balance == 0)
5087 goto out_balanced; 5196 goto out_balanced;
5197 }
5088 5198
5199 group = find_busiest_group(&env);
5089 if (!group) { 5200 if (!group) {
5090 schedstat_inc(sd, lb_nobusyg[idle]); 5201 schedstat_inc(sd, lb_nobusyg[idle]);
5091 goto out_balanced; 5202 goto out_balanced;
@@ -5114,7 +5225,6 @@ redo:
5114 env.src_rq = busiest; 5225 env.src_rq = busiest;
5115 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 5226 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5116 5227
5117 update_h_load(env.src_cpu);
5118more_balance: 5228more_balance:
5119 local_irq_save(flags); 5229 local_irq_save(flags);
5120 double_rq_lock(env.dst_rq, busiest); 5230 double_rq_lock(env.dst_rq, busiest);
@@ -5298,7 +5408,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5298 rcu_read_lock(); 5408 rcu_read_lock();
5299 for_each_domain(this_cpu, sd) { 5409 for_each_domain(this_cpu, sd) {
5300 unsigned long interval; 5410 unsigned long interval;
5301 int balance = 1; 5411 int continue_balancing = 1;
5302 5412
5303 if (!(sd->flags & SD_LOAD_BALANCE)) 5413 if (!(sd->flags & SD_LOAD_BALANCE))
5304 continue; 5414 continue;
@@ -5306,7 +5416,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5306 if (sd->flags & SD_BALANCE_NEWIDLE) { 5416 if (sd->flags & SD_BALANCE_NEWIDLE) {
5307 /* If we've pulled tasks over stop searching: */ 5417 /* If we've pulled tasks over stop searching: */
5308 pulled_task = load_balance(this_cpu, this_rq, 5418 pulled_task = load_balance(this_cpu, this_rq,
5309 sd, CPU_NEWLY_IDLE, &balance); 5419 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing);
5310 } 5421 }
5311 5422
5312 interval = msecs_to_jiffies(sd->balance_interval); 5423 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5544,7 +5655,7 @@ void update_max_interval(void)
5544 */ 5655 */
5545static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5656static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5546{ 5657{
5547 int balance = 1; 5658 int continue_balancing = 1;
5548 struct rq *rq = cpu_rq(cpu); 5659 struct rq *rq = cpu_rq(cpu);
5549 unsigned long interval; 5660 unsigned long interval;
5550 struct sched_domain *sd; 5661 struct sched_domain *sd;
@@ -5576,7 +5687,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5576 } 5687 }
5577 5688
5578 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5689 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5579 if (load_balance(cpu, rq, sd, idle, &balance)) { 5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5580 /* 5691 /*
5581 * The LBF_SOME_PINNED logic could have changed 5692 * The LBF_SOME_PINNED logic could have changed
5582 * env->dst_cpu, so we can't know our idle 5693 * env->dst_cpu, so we can't know our idle
@@ -5599,7 +5710,7 @@ out:
5599 * CPU in our sched group which is doing load balancing more 5710 * CPU in our sched group which is doing load balancing more
5600 * actively. 5711 * actively.
5601 */ 5712 */
5602 if (!balance) 5713 if (!continue_balancing)
5603 break; 5714 break;
5604 } 5715 }
5605 rcu_read_unlock(); 5716 rcu_read_unlock();
@@ -5818,11 +5929,15 @@ static void task_fork_fair(struct task_struct *p)
5818 cfs_rq = task_cfs_rq(current); 5929 cfs_rq = task_cfs_rq(current);
5819 curr = cfs_rq->curr; 5930 curr = cfs_rq->curr;
5820 5931
5821 if (unlikely(task_cpu(p) != this_cpu)) { 5932 /*
5822 rcu_read_lock(); 5933 * Not only the cpu but also the task_group of the parent might have
5823 __set_task_cpu(p, this_cpu); 5934 * been changed after parent->se.parent,cfs_rq were copied to
5824 rcu_read_unlock(); 5935 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
5825 } 5936 * of child point to valid ones.
5937 */
5938 rcu_read_lock();
5939 __set_task_cpu(p, this_cpu);
5940 rcu_read_unlock();
5826 5941
5827 update_curr(cfs_rq); 5942 update_curr(cfs_rq);
5828 5943
@@ -5895,11 +6010,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5895 * and ensure we don't carry in an old decay_count if we 6010 * and ensure we don't carry in an old decay_count if we
5896 * switch back. 6011 * switch back.
5897 */ 6012 */
5898 if (p->se.avg.decay_count) { 6013 if (se->avg.decay_count) {
5899 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 6014 __synchronize_entity_decay(se);
5900 __synchronize_entity_decay(&p->se); 6015 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
5901 subtract_blocked_load_contrib(cfs_rq,
5902 p->se.avg.load_avg_contrib);
5903 } 6016 }
5904#endif 6017#endif
5905} 6018}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..b3c5653e1dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
285 /* Required to track per-cpu representation of a task_group */ 285 /* Required to track per-cpu representation of a task_group */
286 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
287 unsigned long tg_load_contrib; 287 unsigned long tg_load_contrib;
288#endif /* CONFIG_FAIR_GROUP_SCHED */
289 288
290 /* 289 /*
291 * h_load = weight * f(tg) 290 * h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
294 * this group. 293 * this group.
295 */ 294 */
296 unsigned long h_load; 295 unsigned long h_load;
296 u64 last_h_load_update;
297 struct sched_entity *h_load_next;
298#endif /* CONFIG_FAIR_GROUP_SCHED */
297#endif /* CONFIG_SMP */ 299#endif /* CONFIG_SMP */
298 300
299#ifdef CONFIG_FAIR_GROUP_SCHED 301#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
429#ifdef CONFIG_FAIR_GROUP_SCHED 431#ifdef CONFIG_FAIR_GROUP_SCHED
430 /* list of leaf cfs_rq on this cpu: */ 432 /* list of leaf cfs_rq on this cpu: */
431 struct list_head leaf_cfs_rq_list; 433 struct list_head leaf_cfs_rq_list;
432#ifdef CONFIG_SMP
433 unsigned long h_load_throttle;
434#endif /* CONFIG_SMP */
435#endif /* CONFIG_FAIR_GROUP_SCHED */ 434#endif /* CONFIG_FAIR_GROUP_SCHED */
436 435
437#ifdef CONFIG_RT_GROUP_SCHED 436#ifdef CONFIG_RT_GROUP_SCHED
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
595} 594}
596 595
597DECLARE_PER_CPU(struct sched_domain *, sd_llc); 596DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 598DECLARE_PER_CPU(int, sd_llc_id);
599 599
600struct sched_group_power { 600struct sched_group_power {
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
666 * Return the group to which this tasks belongs. 666 * Return the group to which this tasks belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494fc8b4..c7edee71bce8 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t)
104} 104}
105 105
106/* 106/*
107 * Called when a process ceases being the active-running process, either 107 * Called when a process ceases being the active-running process involuntarily
108 * voluntarily or involuntarily. Now we can calculate how long we ran. 108 * due, typically, to expiring its time slice (this may also be called when
109 * switching to the idle task). Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call 110 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue. 112 * the runqueue.
diff --git a/kernel/signal.c b/kernel/signal.c
index 50e41075ac77..ded28b91fa53 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3394 new_ka.sa.sa_restorer = compat_ptr(restorer); 3394 new_ka.sa.sa_restorer = compat_ptr(restorer);
3395#endif 3395#endif
3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); 3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3397 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 3397 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
3398 if (ret) 3398 if (ret)
3399 return -EFAULT; 3399 return -EFAULT;
3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask); 3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3407 &oact->sa_handler); 3407 &oact->sa_handler);
3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); 3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3409 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 3409 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3410#ifdef __ARCH_HAS_SA_RESTORER 3410#ifdef __ARCH_HAS_SA_RESTORER
3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), 3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3412 &oact->sa_restorer); 3412 &oact->sa_restorer);
diff --git a/kernel/smp.c b/kernel/smp.c
index fe9f773d7114..0564571dcdf7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) 51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
52 return notifier_from_errno(-ENOMEM); 53 return notifier_from_errno(-ENOMEM);
54 }
53 cfd->csd = alloc_percpu(struct call_single_data); 55 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) { 56 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
55 free_cpumask_var(cfd->cpumask); 58 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM); 59 return notifier_from_errno(-ENOMEM);
57 } 60 }
@@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void)
186 189
187 while (!list_empty(&list)) { 190 while (!list_empty(&list)) {
188 struct call_single_data *csd; 191 struct call_single_data *csd;
189 unsigned int csd_flags;
190 192
191 csd = list_entry(list.next, struct call_single_data, list); 193 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&csd->list); 194 list_del(&csd->list);
193 195
194 /*
195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()),
197 * so save them away before making the call:
198 */
199 csd_flags = csd->flags;
200
201 csd->func(csd->info); 196 csd->func(csd->info);
202 197
203 /* 198 csd_unlock(csd);
204 * Unlocked CSDs are valid through generic_exec_single():
205 */
206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(csd);
208 } 199 }
209} 200}
210 201
@@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single);
278 * @wait: If true, wait until function has completed. 269 * @wait: If true, wait until function has completed.
279 * 270 *
280 * Returns 0 on success, else a negative status code (if no cpus were online). 271 * Returns 0 on success, else a negative status code (if no cpus were online).
281 * Note that @wait will be implicitly turned on in case of allocation failures,
282 * since we fall back to on-stack allocation.
283 * 272 *
284 * Selection preference: 273 * Selection preference:
285 * 1) current cpu if in @mask 274 * 1) current cpu if in @mask
@@ -586,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu);
586 * 575 *
587 * If @wait is true, then returns once @func has returned. 576 * If @wait is true, then returns once @func has returned.
588 * 577 *
589 * You must not call this function with disabled interrupts or 578 * You must not call this function with disabled interrupts or from a
590 * from a hardware interrupt handler or from a bottom half handler. 579 * hardware interrupt handler or from a bottom half handler. The
580 * exception is that it may be used during early boot while
581 * early_boot_irqs_disabled is set.
591 */ 582 */
592void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, 583void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
593 void *info, bool wait) 584 void *info, bool wait)
@@ -596,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
596 587
597 smp_call_function_many(mask, func, info, wait); 588 smp_call_function_many(mask, func, info, wait);
598 if (cpumask_test_cpu(cpu, mask)) { 589 if (cpumask_test_cpu(cpu, mask)) {
599 local_irq_disable(); 590 unsigned long flags;
591 local_irq_save(flags);
600 func(info); 592 func(info);
601 local_irq_enable(); 593 local_irq_restore(flags);
602 } 594 }
603 put_cpu(); 595 put_cpu();
604} 596}
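
A minimal usage sketch for on_each_cpu_mask() (illustrative; 'bump' and 'my_counter' are invented names), respecting the constraints documented above: called from process context with interrupts enabled, with wait = true so it returns only after every targeted CPU has run the callback.

/* Illustrative only: count how many online CPUs ran the callback. */
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>

static atomic_t my_counter = ATOMIC_INIT(0);

static void bump(void *info)
{
	atomic_inc((atomic_t *)info);	/* runs on each CPU in the mask, via IPI or locally */
}

static void run_on_all_online(void)
{
	on_each_cpu_mask(cpu_online_mask, bump, &my_counter, true);
}
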
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d3514c325..d7d498d8cc4f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -328,10 +328,19 @@ void irq_enter(void)
328 328
329static inline void invoke_softirq(void) 329static inline void invoke_softirq(void)
330{ 330{
331 if (!force_irqthreads) 331 if (!force_irqthreads) {
332 __do_softirq(); 332 /*
333 else 333 * We can safely execute softirq on the current stack if
334 * it is the irq stack, because it should be near empty
335 * at this stage. But we have no way to know if the arch
336 * calls irq_exit() on the irq stack. So call softirq
337 * in its own stack to prevent from any overrun on top
338 * of a potentially deep task stack.
339 */
340 do_softirq();
341 } else {
334 wakeup_softirqd(); 342 wakeup_softirqd();
343 }
335} 344}
336 345
337static inline void tick_irq_exit(void) 346static inline void tick_irq_exit(void)
@@ -876,7 +885,6 @@ int __init __weak early_irq_init(void)
876 return 0; 885 return 0;
877} 886}
878 887
879#ifdef CONFIG_GENERIC_HARDIRQS
880int __init __weak arch_probe_nr_irqs(void) 888int __init __weak arch_probe_nr_irqs(void)
881{ 889{
882 return NR_IRQS_LEGACY; 890 return NR_IRQS_LEGACY;
@@ -886,4 +894,3 @@ int __init __weak arch_early_irq_init(void)
886{ 894{
887 return 0; 895 return 0;
888} 896}
889#endif
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3ce..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -34,6 +34,20 @@
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l) 35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l) 36#define raw_write_can_lock(l) write_can_lock(l)
37
38/*
39 * Some architectures can relax in favour of the CPU owning the lock.
40 */
41#ifndef arch_read_relax
42# define arch_read_relax(l) cpu_relax()
43#endif
44#ifndef arch_write_relax
45# define arch_write_relax(l) cpu_relax()
46#endif
47#ifndef arch_spin_relax
48# define arch_spin_relax(l) cpu_relax()
49#endif
50
37/* 51/*
38 * We build the __lock_function inlines here. They are too large for 52 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function 53 * inlining all over the place, but here is only one user per function
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b299f8..c18ecca575b4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
337 if (rgid != (gid_t) -1) { 337 if (rgid != (gid_t) -1) {
338 if (gid_eq(old->gid, krgid) || 338 if (gid_eq(old->gid, krgid) ||
339 gid_eq(old->egid, krgid) || 339 gid_eq(old->egid, krgid) ||
340 nsown_capable(CAP_SETGID)) 340 ns_capable(old->user_ns, CAP_SETGID))
341 new->gid = krgid; 341 new->gid = krgid;
342 else 342 else
343 goto error; 343 goto error;
@@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
346 if (gid_eq(old->gid, kegid) || 346 if (gid_eq(old->gid, kegid) ||
347 gid_eq(old->egid, kegid) || 347 gid_eq(old->egid, kegid) ||
348 gid_eq(old->sgid, kegid) || 348 gid_eq(old->sgid, kegid) ||
349 nsown_capable(CAP_SETGID)) 349 ns_capable(old->user_ns, CAP_SETGID))
350 new->egid = kegid; 350 new->egid = kegid;
351 else 351 else
352 goto error; 352 goto error;
@@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
387 old = current_cred(); 387 old = current_cred();
388 388
389 retval = -EPERM; 389 retval = -EPERM;
390 if (nsown_capable(CAP_SETGID)) 390 if (ns_capable(old->user_ns, CAP_SETGID))
391 new->gid = new->egid = new->sgid = new->fsgid = kgid; 391 new->gid = new->egid = new->sgid = new->fsgid = kgid;
392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
393 new->egid = new->fsgid = kgid; 393 new->egid = new->fsgid = kgid;
@@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
471 new->uid = kruid; 471 new->uid = kruid;
472 if (!uid_eq(old->uid, kruid) && 472 if (!uid_eq(old->uid, kruid) &&
473 !uid_eq(old->euid, kruid) && 473 !uid_eq(old->euid, kruid) &&
474 !nsown_capable(CAP_SETUID)) 474 !ns_capable(old->user_ns, CAP_SETUID))
475 goto error; 475 goto error;
476 } 476 }
477 477
@@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
480 if (!uid_eq(old->uid, keuid) && 480 if (!uid_eq(old->uid, keuid) &&
481 !uid_eq(old->euid, keuid) && 481 !uid_eq(old->euid, keuid) &&
482 !uid_eq(old->suid, keuid) && 482 !uid_eq(old->suid, keuid) &&
483 !nsown_capable(CAP_SETUID)) 483 !ns_capable(old->user_ns, CAP_SETUID))
484 goto error; 484 goto error;
485 } 485 }
486 486
@@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
534 old = current_cred(); 534 old = current_cred();
535 535
536 retval = -EPERM; 536 retval = -EPERM;
537 if (nsown_capable(CAP_SETUID)) { 537 if (ns_capable(old->user_ns, CAP_SETUID)) {
538 new->suid = new->uid = kuid; 538 new->suid = new->uid = kuid;
539 if (!uid_eq(kuid, old->uid)) { 539 if (!uid_eq(kuid, old->uid)) {
540 retval = set_user(new); 540 retval = set_user(new);
@@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
591 old = current_cred(); 591 old = current_cred();
592 592
593 retval = -EPERM; 593 retval = -EPERM;
594 if (!nsown_capable(CAP_SETUID)) { 594 if (!ns_capable(old->user_ns, CAP_SETUID)) {
595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
597 goto error; 597 goto error;
@@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
673 old = current_cred(); 673 old = current_cred();
674 674
675 retval = -EPERM; 675 retval = -EPERM;
676 if (!nsown_capable(CAP_SETGID)) { 676 if (!ns_capable(old->user_ns, CAP_SETGID)) {
677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
679 goto error; 679 goto error;
@@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
744 744
745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
747 nsown_capable(CAP_SETUID)) { 747 ns_capable(old->user_ns, CAP_SETUID)) {
748 if (!uid_eq(kuid, old->fsuid)) { 748 if (!uid_eq(kuid, old->fsuid)) {
749 new->fsuid = kuid; 749 new->fsuid = kuid;
750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
783 783
784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
786 nsown_capable(CAP_SETGID)) { 786 ns_capable(old->user_ns, CAP_SETGID)) {
787 if (!gid_eq(kgid, old->fsgid)) { 787 if (!gid_eq(kgid, old->fsgid)) {
788 new->fsgid = kgid; 788 new->fsgid = kgid;
789 goto change_okay; 789 goto change_okay;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc468e17..b2f06f3c6a3f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = {
1225 .data = &hugepages_treat_as_movable, 1225 .data = &hugepages_treat_as_movable,
1226 .maxlen = sizeof(int), 1226 .maxlen = sizeof(int),
1227 .mode = 0644, 1227 .mode = 0644,
1228 .proc_handler = hugetlb_treat_movable_handler, 1228 .proc_handler = proc_dointvec,
1229 }, 1229 },
1230 { 1230 {
1231 .procname = "nr_overcommit_hugepages", 1231 .procname = "nr_overcommit_hugepages",
@@ -1471,14 +1471,14 @@ static struct ctl_table fs_table[] = {
1471 { 1471 {
1472 .procname = "inode-nr", 1472 .procname = "inode-nr",
1473 .data = &inodes_stat, 1473 .data = &inodes_stat,
1474 .maxlen = 2*sizeof(int), 1474 .maxlen = 2*sizeof(long),
1475 .mode = 0444, 1475 .mode = 0444,
1476 .proc_handler = proc_nr_inodes, 1476 .proc_handler = proc_nr_inodes,
1477 }, 1477 },
1478 { 1478 {
1479 .procname = "inode-state", 1479 .procname = "inode-state",
1480 .data = &inodes_stat, 1480 .data = &inodes_stat,
1481 .maxlen = 7*sizeof(int), 1481 .maxlen = 7*sizeof(long),
1482 .mode = 0444, 1482 .mode = 0444,
1483 .proc_handler = proc_nr_inodes, 1483 .proc_handler = proc_nr_inodes,
1484 }, 1484 },
@@ -1508,7 +1508,7 @@ static struct ctl_table fs_table[] = {
1508 { 1508 {
1509 .procname = "dentry-state", 1509 .procname = "dentry-state",
1510 .data = &dentry_stat, 1510 .data = &dentry_stat,
1511 .maxlen = 6*sizeof(int), 1511 .maxlen = 6*sizeof(long),
1512 .mode = 0444, 1512 .mode = 0444,
1513 .proc_handler = proc_nr_dentry, 1513 .proc_handler = proc_nr_dentry,
1514 }, 1514 },
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c92d6f3..8727032e3a6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -4,6 +4,23 @@
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */ 5static struct callback_head work_exited; /* all we need is ->next == NULL */
6 6
7/**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
22 * 0 if succeeds or -ESRCH.
23 */
7int 24int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 25task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{ 26{
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
21 return 0; 38 return 0;
22} 39}
23 40
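
To make the kernel-doc above concrete, a hedged usage sketch (the 'my_ctx' structure and 'my_twork_fn' callback are invented): queue a callback on the current task and fall back to direct cleanup if the task is already exiting.

/* Illustrative caller of init_task_work()/task_work_add(); names are made up. */
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sched.h>

struct my_ctx {
	struct callback_head cb;
	int payload;
};

static void my_twork_fn(struct callback_head *head)
{
	struct my_ctx *ctx = container_of(head, struct my_ctx, cb);

	/* Runs when 'current' returns to user mode (or exits). */
	kfree(ctx);
}

static int queue_my_work(int payload)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;
	ctx->payload = payload;
	init_task_work(&ctx->cb, my_twork_fn);
	if (task_work_add(current, &ctx->cb, true)) {
		kfree(ctx);		/* task is exiting: -ESRCH */
		return -ESRCH;
	}
	return 0;
}
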
41/**
42 * task_work_cancel - cancel a pending work added by task_work_add()
43 * @task: the task which should execute the work
44 * @func: identifies the work to remove
45 *
46 * Find the last queued pending work with ->func == @func and remove
47 * it from queue.
48 *
49 * RETURNS:
50 * The found work or NULL if not found.
51 */
24struct callback_head * 52struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func) 53task_work_cancel(struct task_struct *task, task_work_func_t func)
26{ 54{
27 struct callback_head **pprev = &task->task_works; 55 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL; 56 struct callback_head *work;
29 unsigned long flags; 57 unsigned long flags;
30 /* 58 /*
31 * If cmpxchg() fails we continue without updating pprev. 59 * If cmpxchg() fails we continue without updating pprev.
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
35 */ 63 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 64 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) { 65 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends(); 66 smp_read_barrier_depends();
39 if (work->func != func) 67 if (work->func != func)
40 pprev = &work->next; 68 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work) 69 else if (cmpxchg(pprev, work, work->next) == work)
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
46 return work; 74 return work;
47} 75}
48 76
77/**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
49void task_work_run(void) 85void task_work_run(void)
50{ 86{
51 struct task_struct *task = current; 87 struct task_struct *task = current;
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..2b62fe86f9ec 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -105,7 +105,6 @@ config NO_HZ_FULL
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN 107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK 108 select IRQ_WORK
110 help 109 help
111 Adaptively try to shutdown the tick whenever possible, even when 110 Adaptively try to shutdown the tick whenever possible, even when
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL
134 Note the boot CPU will still be kept outside the range to 133 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty. 134 handle the timekeeping duty.
136 135
136config NO_HZ_FULL_SYSIDLE
137 bool "Detect full-system idle state for full dynticks system"
138 depends on NO_HZ_FULL
139 default n
140 help
141 At least one CPU must keep the scheduling-clock tick running for
142 timekeeping purposes whenever there is a non-idle CPU, where
143 "non-idle" also includes dynticks CPUs as long as they are
144 running non-idle tasks. Because the underlying adaptive-tick
145 support cannot distinguish between all CPUs being idle and
146 all CPUs each running a single task in dynticks mode, the
147 underlying support simply ensures that there is always a CPU
148 handling the scheduling-clock tick, whether or not all CPUs
149 are idle. This Kconfig option enables scalable detection of
150 the all-CPUs-idle state, thus allowing the scheduling-clock
151 tick to be disabled when all CPUs are idle. Note that scalable
152 detection of the all-CPUs-idle state means that larger systems
153 will be slower to declare the all-CPUs-idle state.
154
155 Say Y if you would like to help debug all-CPUs-idle detection.
156
157 Say N if you are unsure.
158
159config NO_HZ_FULL_SYSIDLE_SMALL
160 int "Number of CPUs above which large-system approach is used"
161 depends on NO_HZ_FULL_SYSIDLE
162 range 1 NR_CPUS
163 default 8
164 help
165 The full-system idle detection mechanism takes a lazy approach
166 on large systems, as is required to attain decent scalability.
167 However, on smaller systems, scalability is not anywhere near as
168 large a concern as is energy efficiency. The sysidle subsystem
169 therefore uses a fast but non-scalable algorithm for small
170 systems and a lazier but scalable algorithm for large systems.
171 This Kconfig parameter defines the number of CPUs in the largest
172 system that will be considered to be "small".
173
174 The default value will be fine in most cases. Battery-powered
175 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
176 numbers of CPUs, and (3) are suffering from battery-lifetime
177 problems due to long sysidle latencies might wish to experiment
178 with larger values for this Kconfig parameter. On the other
179 hand, they might be even better served by disabling NO_HZ_FULL
180 entirely, given that NO_HZ_FULL is intended for HPC and
181 real-time workloads that at present do not tend to be run on
182 battery-powered systems.
183
184 Take the default if you are unsure.
185
137config NO_HZ 186config NO_HZ
138 bool "Old Idle dynticks config" 187 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 188 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c866789..662c5798a685 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
33 int res; 33 int res;
34}; 34};
35 35
36/** 36static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 bool ismax)
38 * @latch: value to convert
39 * @evt: pointer to clock event device descriptor
40 *
41 * Math helper, returns latch value converted to nanoseconds (bound checked)
42 */
43u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
44{ 38{
45 u64 clc = (u64) latch << evt->shift; 39 u64 clc = (u64) latch << evt->shift;
40 u64 rnd;
46 41
47 if (unlikely(!evt->mult)) { 42 if (unlikely(!evt->mult)) {
48 evt->mult = 1; 43 evt->mult = 1;
49 WARN_ON(1); 44 WARN_ON(1);
50 } 45 }
46 rnd = (u64) evt->mult - 1;
47
48 /*
49 * Upper bound sanity check. If the backwards conversion is
 50 * not equal to latch, we know that the above shift overflowed.
51 */
52 if ((clc >> evt->shift) != (u64)latch)
53 clc = ~0ULL;
54
55 /*
56 * Scaled math oddities:
57 *
58 * For mult <= (1 << shift) we can safely add mult - 1 to
59 * prevent integer rounding loss. So the backwards conversion
60 * from nsec to device ticks will be correct.
61 *
62 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
63 * need to be careful. Adding mult - 1 will result in a value
64 * which when converted back to device ticks can be larger
65 * than latch by up to (mult - 1) >> shift. For the min_delta
66 * calculation we still want to apply this in order to stay
67 * above the minimum device ticks limit. For the upper limit
68 * we would end up with a latch value larger than the upper
69 * limit of the device, so we omit the add to stay below the
70 * device upper boundary.
71 *
72 * Also omit the add if it would overflow the u64 boundary.
73 */
74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift)))
76 clc += rnd;
51 77
52 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
53 if (clc < 1000)
54 clc = 1000;
55 if (clc > KTIME_MAX)
56 clc = KTIME_MAX;
57 79
58 return clc; 80 /* Deltas less than 1usec are pointless noise */
81 return clc > 1000 ? clc : 1000;
82}
83
84/**
85 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
86 * @latch: value to convert
87 * @evt: pointer to clock event device descriptor
88 *
89 * Math helper, returns latch value converted to nanoseconds (bound checked)
90 */
91u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
92{
93 return cev_delta2ns(latch, evt, false);
59} 94}
60EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
61 96
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
380 sec = 600; 415 sec = 600;
381 416
382 clockevents_calc_mult_shift(dev, freq, sec); 417 clockevents_calc_mult_shift(dev, freq, sec);
383 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); 418 dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
384 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); 419 dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
385} 420}
386 421
387/** 422/**
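The "scaled math oddities" comment above is easier to follow with concrete numbers. Below is a standalone userspace sketch that mirrors the cev_delta2ns() arithmetic; the mult/shift values model a roughly 2GHz device and are illustrative only, not taken from the patch:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift, int ismax)
	{
		uint64_t clc = (uint64_t)latch << shift;
		uint64_t rnd = (uint64_t)mult - 1;

		if ((clc >> shift) != (uint64_t)latch)	/* the shift overflowed */
			clc = ~0ULL;

		/* Round up, except for the max limit of a device faster than 1GHz. */
		if ((~0ULL - clc > rnd) && (!ismax || mult <= (1U << shift)))
			clc += rnd;

		clc /= mult;			/* stands in for do_div(clc, evt->mult) */
		return clc > 1000 ? clc : 1000;	/* deltas below 1usec are noise */
	}

	int main(void)
	{
		/* Roughly a 2GHz clockevent device: mult/2^shift ~= 2 ticks per ns,
		 * so mult > (1 << shift) and the max-limit special case applies. */
		uint32_t mult = 0x80000000u, shift = 30;

		printf("min_delta_ns: %llu\n", (unsigned long long)delta2ns(2, mult, shift, 0));
		printf("max_delta_ns: %llu\n", (unsigned long long)delta2ns(0x7fffffff, mult, shift, 1));
		return 0;
	}
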
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577b..bb2215174f05 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work)
516 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); 516 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
517} 517}
518 518
519static void notify_cmos_timer(void) 519void ntp_notify_cmos_timer(void)
520{ 520{
521 schedule_delayed_work(&sync_cmos_work, 0); 521 schedule_delayed_work(&sync_cmos_work, 0);
522} 522}
523 523
524#else 524#else
525static inline void notify_cmos_timer(void) { } 525void ntp_notify_cmos_timer(void) { }
526#endif 526#endif
527 527
528 528
@@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
687 if (!(time_status & STA_NANO)) 687 if (!(time_status & STA_NANO))
688 txc->time.tv_usec /= NSEC_PER_USEC; 688 txc->time.tv_usec /= NSEC_PER_USEC;
689 689
690 notify_cmos_timer();
691
692 return result; 690 return result;
693} 691}
694 692
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e8a1516cc0a3..3612fc77f834 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h> 24#include <linux/posix-timers.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/context_tracking.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28 29
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148} 149}
149 150
150#ifdef CONFIG_NO_HZ_FULL 151#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask; 152cpumask_var_t tick_nohz_full_mask;
152bool have_nohz_full_mask; 153bool tick_nohz_full_running;
153 154
154static bool can_stop_full_tick(void) 155static bool can_stop_full_tick(void)
155{ 156{
@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void)
182 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
183 * full NO_HZ with this machine. 184 * full NO_HZ with this machine.
184 */ 185 */
185 WARN_ONCE(have_nohz_full_mask, 186 WARN_ONCE(tick_nohz_full_running,
186 "NO_HZ FULL will not work with unstable sched clock"); 187 "NO_HZ FULL will not work with unstable sched clock");
187 return false; 188 return false;
188 } 189 }
@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
197 * Re-evaluate the need for the tick on the current CPU 198 * Re-evaluate the need for the tick on the current CPU
198 * and restart it if necessary. 199 * and restart it if necessary.
199 */ 200 */
200void tick_nohz_full_check(void) 201void __tick_nohz_full_check(void)
201{ 202{
202 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 203 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
203 204
@@ -211,7 +212,7 @@ void tick_nohz_full_check(void)
211 212
212static void nohz_full_kick_work_func(struct irq_work *work) 213static void nohz_full_kick_work_func(struct irq_work *work)
213{ 214{
214 tick_nohz_full_check(); 215 __tick_nohz_full_check();
215} 216}
216 217
217static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 218static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void)
230 231
231static void nohz_full_kick_ipi(void *info) 232static void nohz_full_kick_ipi(void *info)
232{ 233{
233 tick_nohz_full_check(); 234 __tick_nohz_full_check();
234} 235}
235 236
236/* 237/*
@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
239 */ 240 */
240void tick_nohz_full_kick_all(void) 241void tick_nohz_full_kick_all(void)
241{ 242{
242 if (!have_nohz_full_mask) 243 if (!tick_nohz_full_running)
243 return; 244 return;
244 245
245 preempt_disable(); 246 preempt_disable();
246 smp_call_function_many(nohz_full_mask, 247 smp_call_function_many(tick_nohz_full_mask,
247 nohz_full_kick_ipi, NULL, false); 248 nohz_full_kick_ipi, NULL, false);
249 tick_nohz_full_kick();
248 preempt_enable(); 250 preempt_enable();
249} 251}
250 252
@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void)
253 * It might need the tick due to per task/process properties: 255 * It might need the tick due to per task/process properties:
254 * perf events, posix cpu timers, ... 256 * perf events, posix cpu timers, ...
255 */ 257 */
256void tick_nohz_task_switch(struct task_struct *tsk) 258void __tick_nohz_task_switch(struct task_struct *tsk)
257{ 259{
258 unsigned long flags; 260 unsigned long flags;
259 261
@@ -269,31 +271,23 @@ out:
269 local_irq_restore(flags); 271 local_irq_restore(flags);
270} 272}
271 273
272int tick_nohz_full_cpu(int cpu)
273{
274 if (!have_nohz_full_mask)
275 return 0;
276
277 return cpumask_test_cpu(cpu, nohz_full_mask);
278}
279
280/* Parse the boot-time nohz CPU list from the kernel parameters. */ 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
281static int __init tick_nohz_full_setup(char *str) 275static int __init tick_nohz_full_setup(char *str)
282{ 276{
283 int cpu; 277 int cpu;
284 278
285 alloc_bootmem_cpumask_var(&nohz_full_mask); 279 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
286 if (cpulist_parse(str, nohz_full_mask) < 0) { 280 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
287 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
288 return 1; 282 return 1;
289 } 283 }
290 284
291 cpu = smp_processor_id(); 285 cpu = smp_processor_id();
292 if (cpumask_test_cpu(cpu, nohz_full_mask)) { 286 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
293 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
294 cpumask_clear_cpu(cpu, nohz_full_mask); 288 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
295 } 289 }
296 have_nohz_full_mask = true; 290 tick_nohz_full_running = true;
297 291
298 return 1; 292 return 1;
299} 293}
@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
311 * If we handle the timekeeping duty for full dynticks CPUs, 305 * If we handle the timekeeping duty for full dynticks CPUs,
312 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
313 */ 307 */
314 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
315 return NOTIFY_BAD; 309 return NOTIFY_BAD;
316 break; 310 break;
317 } 311 }
@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void)
330 int err = -1; 324 int err = -1;
331 325
332#ifdef CONFIG_NO_HZ_FULL_ALL 326#ifdef CONFIG_NO_HZ_FULL_ALL
333 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { 327 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
334 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
335 return err; 329 return err;
336 } 330 }
337 err = 0; 331 err = 0;
338 cpumask_setall(nohz_full_mask); 332 cpumask_setall(tick_nohz_full_mask);
339 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); 333 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
340 have_nohz_full_mask = true; 334 tick_nohz_full_running = true;
341#endif 335#endif
342 return err; 336 return err;
343} 337}
344 338
345void __init tick_nohz_init(void) 339void __init tick_nohz_init(void)
346{ 340{
347 if (!have_nohz_full_mask) { 341 int cpu;
342
343 if (!tick_nohz_full_running) {
348 if (tick_nohz_init_all() < 0) 344 if (tick_nohz_init_all() < 0)
349 return; 345 return;
350 } 346 }
351 347
348 for_each_cpu(cpu, tick_nohz_full_mask)
349 context_tracking_cpu_set(cpu);
350
352 cpu_notifier(tick_nohz_cpu_down_callback, 0); 351 cpu_notifier(tick_nohz_cpu_down_callback, 0);
353 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 352 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
354 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 353 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
355} 354}
356#else
357#define have_nohz_full_mask (0)
358#endif 355#endif
359 356
360/* 357/*
@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
732 return false; 729 return false;
733 } 730 }
734 731
735 if (have_nohz_full_mask) { 732 if (tick_nohz_full_enabled()) {
736 /* 733 /*
737 * Keep the tick alive to guarantee timekeeping progression 734 * Keep the tick alive to guarantee timekeeping progression
738 * if there are full dynticks CPUs around 735 * if there are full dynticks CPUs around
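This file-side hunk renames the globals (tick_nohz_full_mask, tick_nohz_full_running), drops the out-of-line tick_nohz_full_cpu(), and switches can_stop_idle_tick() to tick_nohz_full_enabled(), which implies the accessors now live in a header that is not part of this excerpt. A rough sketch of what such inline accessors would look like, reconstructed only from the symbols used in this file (an assumption, not the actual include/linux/tick.h change):

	#ifdef CONFIG_NO_HZ_FULL
	extern cpumask_var_t tick_nohz_full_mask;
	extern bool tick_nohz_full_running;

	static inline bool tick_nohz_full_enabled(void)
	{
		return tick_nohz_full_running;
	}

	static inline bool tick_nohz_full_cpu(int cpu)
	{
		if (!tick_nohz_full_enabled())
			return false;
		return cpumask_test_cpu(cpu, tick_nohz_full_mask);
	}
	#else
	static inline bool tick_nohz_full_enabled(void) { return false; }
	static inline bool tick_nohz_full_cpu(int cpu) { return false; }
	#endif
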
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc2..947ba25a95a0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc)
1703 write_seqcount_end(&timekeeper_seq); 1703 write_seqcount_end(&timekeeper_seq);
1704 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1704 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1705 1705
1706 ntp_notify_cmos_timer();
1707
1706 return ret; 1708 return ret;
1707} 1709}
1708 1710
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a6d098c6df3f..03cf44ac54d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1978 1978
1979void ftrace_modify_all_code(int command) 1979void ftrace_modify_all_code(int command)
1980{ 1980{
1981 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1982
1983 /*
1984 * If the ftrace_caller calls a ftrace_ops func directly,
1985 * we need to make sure that it only traces functions it
1986 * expects to trace. When doing the switch of functions,
1987 * we need to update to the ftrace_ops_list_func first
 1988 * before the transition between old and new calls is set,
 1989 * as the ftrace_ops_list_func will check the ops hashes
 1990 * to make sure the ops have the right functions
 1991 * traced.
1992 */
1993 if (update)
1994 ftrace_update_ftrace_func(ftrace_ops_list_func);
1995
1981 if (command & FTRACE_UPDATE_CALLS) 1996 if (command & FTRACE_UPDATE_CALLS)
1982 ftrace_replace_code(1); 1997 ftrace_replace_code(1);
1983 else if (command & FTRACE_DISABLE_CALLS) 1998 else if (command & FTRACE_DISABLE_CALLS)
1984 ftrace_replace_code(0); 1999 ftrace_replace_code(0);
1985 2000
1986 if (command & FTRACE_UPDATE_TRACE_FUNC) 2001 if (update && ftrace_trace_function != ftrace_ops_list_func)
1987 ftrace_update_ftrace_func(ftrace_trace_function); 2002 ftrace_update_ftrace_func(ftrace_trace_function);
1988 2003
1989 if (command & FTRACE_START_FUNC_RET) 2004 if (command & FTRACE_START_FUNC_RET)
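The comment in this hunk is an ordering rule: while call sites are being rewritten, the dispatcher must point at the conservative list walker, and only afterwards at the direct callback. A toy sketch of that pattern using plain function pointers; this is not ftrace internals, and list_walker()/direct_tracer()/switch_tracer() are invented names:

	typedef void (*tracer_fn)(unsigned long ip);

	static void list_walker(unsigned long ip)   { /* checks every registered ops' hash */ }
	static void direct_tracer(unsigned long ip) { /* assumes a single filtered ops */ }

	static tracer_fn active_tracer = list_walker;

	static void switch_tracer(tracer_fn new_fn, void (*rewrite_callsites)(void))
	{
		active_tracer = list_walker;	/* safe while old and new call sites coexist */
		rewrite_callsites();		/* the transition window */
		active_tracer = new_fn;		/* only now trust the direct callback */
	}
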
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 496f94d57698..7974ba20557d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3166,11 +3166,6 @@ static const struct file_operations show_traces_fops = {
3166}; 3166};
3167 3167
3168/* 3168/*
3169 * Only trace on a CPU if the bitmask is set:
3170 */
3171static cpumask_var_t tracing_cpumask;
3172
3173/*
3174 * The tracer itself will not take this lock, but still we want 3169 * The tracer itself will not take this lock, but still we want
3175 * to provide a consistent cpumask to user-space: 3170 * to provide a consistent cpumask to user-space:
3176 */ 3171 */
@@ -3186,11 +3181,12 @@ static ssize_t
3186tracing_cpumask_read(struct file *filp, char __user *ubuf, 3181tracing_cpumask_read(struct file *filp, char __user *ubuf,
3187 size_t count, loff_t *ppos) 3182 size_t count, loff_t *ppos)
3188{ 3183{
3184 struct trace_array *tr = file_inode(filp)->i_private;
3189 int len; 3185 int len;
3190 3186
3191 mutex_lock(&tracing_cpumask_update_lock); 3187 mutex_lock(&tracing_cpumask_update_lock);
3192 3188
3193 len = cpumask_scnprintf(mask_str, count, tracing_cpumask); 3189 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
3194 if (count - len < 2) { 3190 if (count - len < 2) {
3195 count = -EINVAL; 3191 count = -EINVAL;
3196 goto out_err; 3192 goto out_err;
@@ -3208,7 +3204,7 @@ static ssize_t
3208tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3204tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3209 size_t count, loff_t *ppos) 3205 size_t count, loff_t *ppos)
3210{ 3206{
3211 struct trace_array *tr = filp->private_data; 3207 struct trace_array *tr = file_inode(filp)->i_private;
3212 cpumask_var_t tracing_cpumask_new; 3208 cpumask_var_t tracing_cpumask_new;
3213 int err, cpu; 3209 int err, cpu;
3214 3210
@@ -3228,12 +3224,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3228 * Increase/decrease the disabled counter if we are 3224 * Increase/decrease the disabled counter if we are
3229 * about to flip a bit in the cpumask: 3225 * about to flip a bit in the cpumask:
3230 */ 3226 */
3231 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3227 if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3232 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3228 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3233 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3229 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3234 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); 3230 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
3235 } 3231 }
3236 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3232 if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3237 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3233 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3238 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3234 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3239 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3235 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
@@ -3242,7 +3238,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3242 arch_spin_unlock(&ftrace_max_lock); 3238 arch_spin_unlock(&ftrace_max_lock);
3243 local_irq_enable(); 3239 local_irq_enable();
3244 3240
3245 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 3241 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
3246 3242
3247 mutex_unlock(&tracing_cpumask_update_lock); 3243 mutex_unlock(&tracing_cpumask_update_lock);
3248 free_cpumask_var(tracing_cpumask_new); 3244 free_cpumask_var(tracing_cpumask_new);
@@ -3256,9 +3252,10 @@ err_unlock:
3256} 3252}
3257 3253
3258static const struct file_operations tracing_cpumask_fops = { 3254static const struct file_operations tracing_cpumask_fops = {
3259 .open = tracing_open_generic, 3255 .open = tracing_open_generic_tr,
3260 .read = tracing_cpumask_read, 3256 .read = tracing_cpumask_read,
3261 .write = tracing_cpumask_write, 3257 .write = tracing_cpumask_write,
3258 .release = tracing_release_generic_tr,
3262 .llseek = generic_file_llseek, 3259 .llseek = generic_file_llseek,
3263}; 3260};
3264 3261
@@ -5938,6 +5935,11 @@ static int new_instance_create(const char *name)
5938 if (!tr->name) 5935 if (!tr->name)
5939 goto out_free_tr; 5936 goto out_free_tr;
5940 5937
5938 if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
5939 goto out_free_tr;
5940
5941 cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
5942
5941 raw_spin_lock_init(&tr->start_lock); 5943 raw_spin_lock_init(&tr->start_lock);
5942 5944
5943 tr->current_trace = &nop_trace; 5945 tr->current_trace = &nop_trace;
@@ -5969,6 +5971,7 @@ static int new_instance_create(const char *name)
5969 out_free_tr: 5971 out_free_tr:
5970 if (tr->trace_buffer.buffer) 5972 if (tr->trace_buffer.buffer)
5971 ring_buffer_free(tr->trace_buffer.buffer); 5973 ring_buffer_free(tr->trace_buffer.buffer);
5974 free_cpumask_var(tr->tracing_cpumask);
5972 kfree(tr->name); 5975 kfree(tr->name);
5973 kfree(tr); 5976 kfree(tr);
5974 5977
@@ -6098,6 +6101,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6098{ 6101{
6099 int cpu; 6102 int cpu;
6100 6103
6104 trace_create_file("tracing_cpumask", 0644, d_tracer,
6105 tr, &tracing_cpumask_fops);
6106
6101 trace_create_file("trace_options", 0644, d_tracer, 6107 trace_create_file("trace_options", 0644, d_tracer,
6102 tr, &tracing_iter_fops); 6108 tr, &tracing_iter_fops);
6103 6109
@@ -6147,9 +6153,6 @@ static __init int tracer_init_debugfs(void)
6147 6153
6148 init_tracer_debugfs(&global_trace, d_tracer); 6154 init_tracer_debugfs(&global_trace, d_tracer);
6149 6155
6150 trace_create_file("tracing_cpumask", 0644, d_tracer,
6151 &global_trace, &tracing_cpumask_fops);
6152
6153 trace_create_file("available_tracers", 0444, d_tracer, 6156 trace_create_file("available_tracers", 0444, d_tracer,
6154 &global_trace, &show_traces_fops); 6157 &global_trace, &show_traces_fops);
6155 6158
@@ -6371,7 +6374,7 @@ __init static int tracer_alloc_buffers(void)
6371 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6374 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6372 goto out; 6375 goto out;
6373 6376
6374 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 6377 if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
6375 goto out_free_buffer_mask; 6378 goto out_free_buffer_mask;
6376 6379
6377 /* Only allocate trace_printk buffers if a trace_printk exists */ 6380 /* Only allocate trace_printk buffers if a trace_printk exists */
@@ -6386,7 +6389,7 @@ __init static int tracer_alloc_buffers(void)
6386 ring_buf_size = 1; 6389 ring_buf_size = 1;
6387 6390
6388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6391 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
6389 cpumask_copy(tracing_cpumask, cpu_all_mask); 6392 cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
6390 6393
6391 raw_spin_lock_init(&global_trace.start_lock); 6394 raw_spin_lock_init(&global_trace.start_lock);
6392 6395
@@ -6441,7 +6444,7 @@ out_free_cpumask:
6441#ifdef CONFIG_TRACER_MAX_TRACE 6444#ifdef CONFIG_TRACER_MAX_TRACE
6442 free_percpu(global_trace.max_buffer.data); 6445 free_percpu(global_trace.max_buffer.data);
6443#endif 6446#endif
6444 free_cpumask_var(tracing_cpumask); 6447 free_cpumask_var(global_trace.tracing_cpumask);
6445out_free_buffer_mask: 6448out_free_buffer_mask:
6446 free_cpumask_var(tracing_buffer_mask); 6449 free_cpumask_var(tracing_buffer_mask);
6447out: 6450out:
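With tracing_cpumask now stored per trace_array and created by init_tracer_debugfs(), every trace instance directory gets its own copy of the file. A userspace sketch of driving it; the debugfs mount point and the "demo" instance name are assumptions:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		const char *inst = "/sys/kernel/debug/tracing/instances/demo";
		char path[256];
		int fd;

		/* Creating the directory creates the trace instance. */
		if (mkdir(inst, 0755) && errno != EEXIST) {
			perror("mkdir");
			return 1;
		}

		snprintf(path, sizeof(path), "%s/tracing_cpumask", inst);
		fd = open(path, O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Restrict this instance to CPUs 0-1; the top-level mask is untouched. */
		if (write(fd, "3\n", 2) != 2)
			perror("write");
		close(fd);
		return 0;
	}
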
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41b0a02..10c86fb7a2b4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -206,6 +206,7 @@ struct trace_array {
206 struct dentry *event_dir; 206 struct dentry *event_dir;
207 struct list_head systems; 207 struct list_head systems;
208 struct list_head events; 208 struct list_head events;
209 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
209 int ref; 210 int ref;
210}; 211};
211 212
@@ -1022,6 +1023,9 @@ extern struct list_head ftrace_events;
1022extern const char *__start___trace_bprintk_fmt[]; 1023extern const char *__start___trace_bprintk_fmt[];
1023extern const char *__stop___trace_bprintk_fmt[]; 1024extern const char *__stop___trace_bprintk_fmt[];
1024 1025
1026extern const char *__start___tracepoint_str[];
1027extern const char *__stop___tracepoint_str[];
1028
1025void trace_printk_init_buffers(void); 1029void trace_printk_init_buffers(void);
1026void trace_printk_start_comm(void); 1030void trace_printk_start_comm(void);
1027int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1031int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29a7ebcfb426..368a4d50cc30 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1489} 1489}
1490 1490
1491static int 1491static int
1492event_create_dir(struct dentry *parent, 1492event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1493 struct ftrace_event_file *file,
1494 const struct file_operations *id,
1495 const struct file_operations *enable,
1496 const struct file_operations *filter,
1497 const struct file_operations *format)
1498{ 1493{
1499 struct ftrace_event_call *call = file->event_call; 1494 struct ftrace_event_call *call = file->event_call;
1500 struct trace_array *tr = file->tr; 1495 struct trace_array *tr = file->tr;
@@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent,
1522 1517
1523 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1518 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1524 trace_create_file("enable", 0644, file->dir, file, 1519 trace_create_file("enable", 0644, file->dir, file,
1525 enable); 1520 &ftrace_enable_fops);
1526 1521
1527#ifdef CONFIG_PERF_EVENTS 1522#ifdef CONFIG_PERF_EVENTS
1528 if (call->event.type && call->class->reg) 1523 if (call->event.type && call->class->reg)
1529 trace_create_file("id", 0444, file->dir, 1524 trace_create_file("id", 0444, file->dir,
1530 (void *)(long)call->event.type, id); 1525 (void *)(long)call->event.type,
1526 &ftrace_event_id_fops);
1531#endif 1527#endif
1532 1528
1533 /* 1529 /*
@@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent,
1544 } 1540 }
1545 } 1541 }
1546 trace_create_file("filter", 0644, file->dir, call, 1542 trace_create_file("filter", 0644, file->dir, call,
1547 filter); 1543 &ftrace_event_filter_fops);
1548 1544
1549 trace_create_file("format", 0444, file->dir, call, 1545 trace_create_file("format", 0444, file->dir, call,
1550 format); 1546 &ftrace_event_format_fops);
1551 1547
1552 return 0; 1548 return 0;
1553} 1549}
@@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call,
1648 1644
1649/* Add an event to a trace directory */ 1645/* Add an event to a trace directory */
1650static int 1646static int
1651__trace_add_new_event(struct ftrace_event_call *call, 1647__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
1652 struct trace_array *tr,
1653 const struct file_operations *id,
1654 const struct file_operations *enable,
1655 const struct file_operations *filter,
1656 const struct file_operations *format)
1657{ 1648{
1658 struct ftrace_event_file *file; 1649 struct ftrace_event_file *file;
1659 1650
@@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
1661 if (!file) 1652 if (!file)
1662 return -ENOMEM; 1653 return -ENOMEM;
1663 1654
1664 return event_create_dir(tr->event_dir, file, id, enable, filter, format); 1655 return event_create_dir(tr->event_dir, file);
1665} 1656}
1666 1657
1667/* 1658/*
@@ -1683,8 +1674,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
1683} 1674}
1684 1675
1685struct ftrace_module_file_ops; 1676struct ftrace_module_file_ops;
1686static void __add_event_to_tracers(struct ftrace_event_call *call, 1677static void __add_event_to_tracers(struct ftrace_event_call *call);
1687 struct ftrace_module_file_ops *file_ops);
1688 1678
1689/* Add an additional event_call dynamically */ 1679/* Add an additional event_call dynamically */
1690int trace_add_event_call(struct ftrace_event_call *call) 1680int trace_add_event_call(struct ftrace_event_call *call)
@@ -1695,7 +1685,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
1695 1685
1696 ret = __register_event(call, NULL); 1686 ret = __register_event(call, NULL);
1697 if (ret >= 0) 1687 if (ret >= 0)
1698 __add_event_to_tracers(call, NULL); 1688 __add_event_to_tracers(call);
1699 1689
1700 mutex_unlock(&event_mutex); 1690 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock); 1691 mutex_unlock(&trace_types_lock);
@@ -1769,100 +1759,21 @@ int trace_remove_event_call(struct ftrace_event_call *call)
1769 1759
1770#ifdef CONFIG_MODULES 1760#ifdef CONFIG_MODULES
1771 1761
1772static LIST_HEAD(ftrace_module_file_list);
1773
1774/*
1775 * Modules must own their file_operations to keep up with
1776 * reference counting.
1777 */
1778struct ftrace_module_file_ops {
1779 struct list_head list;
1780 struct module *mod;
1781 struct file_operations id;
1782 struct file_operations enable;
1783 struct file_operations format;
1784 struct file_operations filter;
1785};
1786
1787static struct ftrace_module_file_ops *
1788find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1789{
1790 /*
1791 * As event_calls are added in groups by module,
1792 * when we find one file_ops, we don't need to search for
1793 * each call in that module, as the rest should be the
1794 * same. Only search for a new one if the last one did
1795 * not match.
1796 */
1797 if (file_ops && mod == file_ops->mod)
1798 return file_ops;
1799
1800 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1801 if (file_ops->mod == mod)
1802 return file_ops;
1803 }
1804 return NULL;
1805}
1806
1807static struct ftrace_module_file_ops *
1808trace_create_file_ops(struct module *mod)
1809{
1810 struct ftrace_module_file_ops *file_ops;
1811
1812 /*
1813 * This is a bit of a PITA. To allow for correct reference
1814 * counting, modules must "own" their file_operations.
1815 * To do this, we allocate the file operations that will be
1816 * used in the event directory.
1817 */
1818
1819 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1820 if (!file_ops)
1821 return NULL;
1822
1823 file_ops->mod = mod;
1824
1825 file_ops->id = ftrace_event_id_fops;
1826 file_ops->id.owner = mod;
1827
1828 file_ops->enable = ftrace_enable_fops;
1829 file_ops->enable.owner = mod;
1830
1831 file_ops->filter = ftrace_event_filter_fops;
1832 file_ops->filter.owner = mod;
1833
1834 file_ops->format = ftrace_event_format_fops;
1835 file_ops->format.owner = mod;
1836
1837 list_add(&file_ops->list, &ftrace_module_file_list);
1838
1839 return file_ops;
1840}
1841
1842static void trace_module_add_events(struct module *mod) 1762static void trace_module_add_events(struct module *mod)
1843{ 1763{
1844 struct ftrace_module_file_ops *file_ops = NULL;
1845 struct ftrace_event_call **call, **start, **end; 1764 struct ftrace_event_call **call, **start, **end;
1846 1765
1847 start = mod->trace_events; 1766 start = mod->trace_events;
1848 end = mod->trace_events + mod->num_trace_events; 1767 end = mod->trace_events + mod->num_trace_events;
1849 1768
1850 if (start == end)
1851 return;
1852
1853 file_ops = trace_create_file_ops(mod);
1854 if (!file_ops)
1855 return;
1856
1857 for_each_event(call, start, end) { 1769 for_each_event(call, start, end) {
1858 __register_event(*call, mod); 1770 __register_event(*call, mod);
1859 __add_event_to_tracers(*call, file_ops); 1771 __add_event_to_tracers(*call);
1860 } 1772 }
1861} 1773}
1862 1774
1863static void trace_module_remove_events(struct module *mod) 1775static void trace_module_remove_events(struct module *mod)
1864{ 1776{
1865 struct ftrace_module_file_ops *file_ops;
1866 struct ftrace_event_call *call, *p; 1777 struct ftrace_event_call *call, *p;
1867 bool clear_trace = false; 1778 bool clear_trace = false;
1868 1779
@@ -1874,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod)
1874 __trace_remove_event_call(call); 1785 __trace_remove_event_call(call);
1875 } 1786 }
1876 } 1787 }
1877
1878 /* Now free the file_operations */
1879 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1880 if (file_ops->mod == mod)
1881 break;
1882 }
1883 if (&file_ops->list != &ftrace_module_file_list) {
1884 list_del(&file_ops->list);
1885 kfree(file_ops);
1886 }
1887 up_write(&trace_event_sem); 1788 up_write(&trace_event_sem);
1888 1789
1889 /* 1790 /*
@@ -1919,67 +1820,21 @@ static int trace_module_notify(struct notifier_block *self,
1919 return 0; 1820 return 0;
1920} 1821}
1921 1822
1922static int 1823static struct notifier_block trace_module_nb = {
1923__trace_add_new_mod_event(struct ftrace_event_call *call, 1824 .notifier_call = trace_module_notify,
1924 struct trace_array *tr, 1825 .priority = 0,
1925 struct ftrace_module_file_ops *file_ops) 1826};
1926{
1927 return __trace_add_new_event(call, tr,
1928 &file_ops->id, &file_ops->enable,
1929 &file_ops->filter, &file_ops->format);
1930}
1931
1932#else
1933static inline struct ftrace_module_file_ops *
1934find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1935{
1936 return NULL;
1937}
1938static inline int trace_module_notify(struct notifier_block *self,
1939 unsigned long val, void *data)
1940{
1941 return 0;
1942}
1943static inline int
1944__trace_add_new_mod_event(struct ftrace_event_call *call,
1945 struct trace_array *tr,
1946 struct ftrace_module_file_ops *file_ops)
1947{
1948 return -ENODEV;
1949}
1950#endif /* CONFIG_MODULES */ 1827#endif /* CONFIG_MODULES */
1951 1828
1952/* Create a new event directory structure for a trace directory. */ 1829/* Create a new event directory structure for a trace directory. */
1953static void 1830static void
1954__trace_add_event_dirs(struct trace_array *tr) 1831__trace_add_event_dirs(struct trace_array *tr)
1955{ 1832{
1956 struct ftrace_module_file_ops *file_ops = NULL;
1957 struct ftrace_event_call *call; 1833 struct ftrace_event_call *call;
1958 int ret; 1834 int ret;
1959 1835
1960 list_for_each_entry(call, &ftrace_events, list) { 1836 list_for_each_entry(call, &ftrace_events, list) {
1961 if (call->mod) { 1837 ret = __trace_add_new_event(call, tr);
1962 /*
1963 * Directories for events by modules need to
1964 * keep module ref counts when opened (as we don't
1965 * want the module to disappear when reading one
1966 * of these files). The file_ops keep account of
1967 * the module ref count.
1968 */
1969 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1970 if (!file_ops)
1971 continue; /* Warn? */
1972 ret = __trace_add_new_mod_event(call, tr, file_ops);
1973 if (ret < 0)
1974 pr_warning("Could not create directory for event %s\n",
1975 call->name);
1976 continue;
1977 }
1978 ret = __trace_add_new_event(call, tr,
1979 &ftrace_event_id_fops,
1980 &ftrace_enable_fops,
1981 &ftrace_event_filter_fops,
1982 &ftrace_event_format_fops);
1983 if (ret < 0) 1838 if (ret < 0)
1984 pr_warning("Could not create directory for event %s\n", 1839 pr_warning("Could not create directory for event %s\n",
1985 call->name); 1840 call->name);
@@ -2287,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2287 2142
2288 2143
2289 list_for_each_entry(file, &tr->events, list) { 2144 list_for_each_entry(file, &tr->events, list) {
2290 ret = event_create_dir(tr->event_dir, file, 2145 ret = event_create_dir(tr->event_dir, file);
2291 &ftrace_event_id_fops,
2292 &ftrace_enable_fops,
2293 &ftrace_event_filter_fops,
2294 &ftrace_event_format_fops);
2295 if (ret < 0) 2146 if (ret < 0)
2296 pr_warning("Could not create directory for event %s\n", 2147 pr_warning("Could not create directory for event %s\n",
2297 file->event_call->name); 2148 file->event_call->name);
@@ -2332,29 +2183,14 @@ __trace_remove_event_dirs(struct trace_array *tr)
2332 remove_event_file_dir(file); 2183 remove_event_file_dir(file);
2333} 2184}
2334 2185
2335static void 2186static void __add_event_to_tracers(struct ftrace_event_call *call)
2336__add_event_to_tracers(struct ftrace_event_call *call,
2337 struct ftrace_module_file_ops *file_ops)
2338{ 2187{
2339 struct trace_array *tr; 2188 struct trace_array *tr;
2340 2189
2341 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 2190 list_for_each_entry(tr, &ftrace_trace_arrays, list)
2342 if (file_ops) 2191 __trace_add_new_event(call, tr);
2343 __trace_add_new_mod_event(call, tr, file_ops);
2344 else
2345 __trace_add_new_event(call, tr,
2346 &ftrace_event_id_fops,
2347 &ftrace_enable_fops,
2348 &ftrace_event_filter_fops,
2349 &ftrace_event_format_fops);
2350 }
2351} 2192}
2352 2193
2353static struct notifier_block trace_module_nb = {
2354 .notifier_call = trace_module_notify,
2355 .priority = 0,
2356};
2357
2358extern struct ftrace_event_call *__start_ftrace_events[]; 2194extern struct ftrace_event_call *__start_ftrace_events[];
2359extern struct ftrace_event_call *__stop_ftrace_events[]; 2195extern struct ftrace_event_call *__stop_ftrace_events[];
2360 2196
@@ -2559,10 +2395,11 @@ static __init int event_trace_init(void)
2559 if (ret) 2395 if (ret)
2560 return ret; 2396 return ret;
2561 2397
2398#ifdef CONFIG_MODULES
2562 ret = register_module_notifier(&trace_module_nb); 2399 ret = register_module_notifier(&trace_module_nb);
2563 if (ret) 2400 if (ret)
2564 pr_warning("Failed to register trace events module notifier\n"); 2401 pr_warning("Failed to register trace events module notifier\n");
2565 2402#endif
2566 return 0; 2403 return 0;
2567} 2404}
2568early_initcall(event_trace_memsetup); 2405early_initcall(event_trace_memsetup);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
244{ 244{
245 const char **fmt = v; 245 const char **fmt = v;
246 int start_index; 246 int start_index;
247 int last_index;
247 248
248 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 249 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
249 250
250 if (*pos < start_index) 251 if (*pos < start_index)
251 return __start___trace_bprintk_fmt + *pos; 252 return __start___trace_bprintk_fmt + *pos;
252 253
254 /*
255 * The __tracepoint_str section is treated the same as the
256 * __trace_printk_fmt section. The difference is that the
257 * __trace_printk_fmt section should only be used by trace_printk()
258 * in a debugging environment, as if anything exists in that section
 259 * the trace_printk() helper buffers are allocated, which would just
260 * waste space in a production environment.
261 *
262 * The __tracepoint_str sections on the other hand are used by
263 * tracepoints which need to map pointers to their strings to
264 * the ASCII text for userspace.
265 */
266 last_index = start_index;
267 start_index = __stop___tracepoint_str - __start___tracepoint_str;
268
269 if (*pos < last_index + start_index)
270 return __start___tracepoint_str + (*pos - last_index);
271
253 return find_next_mod_format(start_index, v, fmt, pos); 272 return find_next_mod_format(start_index, v, fmt, pos);
254} 273}
255 274
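The comment above describes what the __tracepoint_str section is for; strings land there via the tracepoint_string() helper macro, which is defined outside this hunk. A hedged sketch of typical use (the fqs_reason variable, pick_reason() function and the header chosen for the include are assumptions for this kernel version):

	#include <linux/ftrace_event.h>	/* assumed home of tracepoint_string() here */

	static const char *fqs_reason;

	static void pick_reason(void)
	{
		/* The literal is emitted into __tracepoint_str, so a tracepoint can
		 * record just the pointer and userspace can still resolve it to
		 * "fqswait" via .../tracing/printk_formats. */
		fqs_reason = tracepoint_string("fqswait");
	}
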
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8fd03657bc7d..559329d9bd2f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -200,8 +200,8 @@ extern char *__bad_type_size(void);
200 #type, #name, offsetof(typeof(trace), name), \ 200 #type, #name, offsetof(typeof(trace), name), \
201 sizeof(trace.name), is_signed_type(type) 201 sizeof(trace.name), is_signed_type(type)
202 202
203static 203static int __init
204int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 204__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
205{ 205{
206 int i; 206 int i;
207 int pos = 0; 207 int pos = 0;
@@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
228 return pos; 228 return pos;
229} 229}
230 230
231static int set_syscall_print_fmt(struct ftrace_event_call *call) 231static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
232{ 232{
233 char *print_fmt; 233 char *print_fmt;
234 int len; 234 int len;
@@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)
253 return 0; 253 return 0;
254} 254}
255 255
256static void free_syscall_print_fmt(struct ftrace_event_call *call) 256static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
257{ 257{
258 struct syscall_metadata *entry = call->data; 258 struct syscall_metadata *entry = call->data;
259 259
@@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
459 mutex_unlock(&syscall_trace_lock); 459 mutex_unlock(&syscall_trace_lock);
460} 460}
461 461
462static int init_syscall_trace(struct ftrace_event_call *call) 462static int __init init_syscall_trace(struct ftrace_event_call *call)
463{ 463{
464 int id; 464 int id;
465 int num; 465 int num;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f6c83d7ef000..602e5bbbceff 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!nsown_capable(CAP_SETGID)) 179 if (!ns_capable(current_user_ns(), CAP_SETGID))
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf7..630d72bf7e41 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,64 @@
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait) 11 int wait)
12{ 12{
13 unsigned long flags;
14
13 WARN_ON(cpu != 0); 15 WARN_ON(cpu != 0);
14 16
15 local_irq_disable(); 17 local_irq_save(flags);
16 (func)(info); 18 func(info);
17 local_irq_enable(); 19 local_irq_restore(flags);
18 20
19 return 0; 21 return 0;
20} 22}
21EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24
25int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{
27 unsigned long flags;
28
29 local_irq_save(flags);
30 func(info);
31 local_irq_restore(flags);
32 return 0;
33}
34EXPORT_SYMBOL(on_each_cpu);
35
36/*
37 * Note we still need to test the mask even for UP
38 * because we actually can get an empty mask from
39 * code that on SMP might call us without the local
40 * CPU in the mask.
41 */
42void on_each_cpu_mask(const struct cpumask *mask,
43 smp_call_func_t func, void *info, bool wait)
44{
45 unsigned long flags;
46
47 if (cpumask_test_cpu(0, mask)) {
48 local_irq_save(flags);
49 func(info);
50 local_irq_restore(flags);
51 }
52}
53EXPORT_SYMBOL(on_each_cpu_mask);
54
55/*
56 * Preemption is disabled here to make sure the cond_func is called under the
57 * same condtions in UP and SMP.
58 */
59void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
60 smp_call_func_t func, void *info, bool wait,
61 gfp_t gfp_flags)
62{
63 unsigned long flags;
64
65 preempt_disable();
66 if (cond_func(0, info)) {
67 local_irq_save(flags);
68 func(info);
69 local_irq_restore(flags);
70 }
71 preempt_enable();
72}
73EXPORT_SYMBOL(on_each_cpu_cond);
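These UP stubs make the cross-CPU helpers behave the same on UP and SMP builds: the callback still runs once on CPU 0 with interrupts off, and the condition function is still evaluated with preemption disabled. A small caller sketch; the example_hits counter, bump() and wants_bump() are made up:

	#include <linux/atomic.h>
	#include <linux/cpumask.h>
	#include <linux/gfp.h>
	#include <linux/smp.h>

	static atomic_t example_hits = ATOMIC_INIT(0);

	static void bump(void *info)
	{
		atomic_inc((atomic_t *)info);
	}

	static bool wants_bump(int cpu, void *info)
	{
		return cpu_online(cpu);		/* illustrative condition */
	}

	static void bump_where_wanted(void)
	{
		/* On UP this runs bump() once on CPU 0, IRQs off, exactly
		 * as the SMP version would for an online, matching CPU. */
		on_each_cpu_cond(wants_bump, bump, &example_hits, true, GFP_KERNEL);
	}
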
diff --git a/kernel/user.c b/kernel/user.c
index 69b4c3d48cde..5bbb91988e69 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,8 +51,6 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
56}; 54};
57EXPORT_SYMBOL_GPL(init_user_ns); 55EXPORT_SYMBOL_GPL(init_user_ns);
58 56
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9064b919a406..13fb1134ba58 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,8 +101,6 @@ int create_user_ns(struct cred *new)
101 101
102 set_cred_user_ns(new, ns); 102 set_cred_user_ns(new, ns);
103 103
104 update_mnt_policy(ns);
105
106 return 0; 104 return 0;
107} 105}
108 106
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2fc8576efaa8..fd393124e507 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
114 struct uts_namespace *ns = new; 114 struct uts_namespace *ns = new;
115 115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN)) 117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
118 return -EPERM; 118 return -EPERM;
119 119
120 get_uts_ns(ns); 120 get_uts_ns(ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1241d8c91d5e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
486 .unpark = watchdog_enable, 486 .unpark = watchdog_enable,
487}; 487};
488 488
489static int watchdog_enable_all_cpus(void) 489static void restart_watchdog_hrtimer(void *info)
490{
491 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
492 int ret;
493
494 /*
495 * No need to cancel and restart hrtimer if it is currently executing
496 * because it will reprogram itself with the new period now.
497 * We should never see it unqueued here because we are running per-cpu
498 * with interrupts disabled.
499 */
500 ret = hrtimer_try_to_cancel(hrtimer);
501 if (ret == 1)
502 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
503 HRTIMER_MODE_REL_PINNED);
504}
505
506static void update_timers(int cpu)
507{
508 struct call_single_data data = {.func = restart_watchdog_hrtimer};
509 /*
 510 * Make sure that the perf event counter will adapt to a new
511 * sampling period. Updating the sampling period directly would
512 * be much nicer but we do not have an API for that now so
513 * let's use a big hammer.
514 * Hrtimer will adopt the new period on the next tick but this
515 * might be late already so we have to restart the timer as well.
516 */
517 watchdog_nmi_disable(cpu);
518 __smp_call_function_single(cpu, &data, 1);
519 watchdog_nmi_enable(cpu);
520}
521
522static void update_timers_all_cpus(void)
523{
524 int cpu;
525
526 get_online_cpus();
527 preempt_disable();
528 for_each_online_cpu(cpu)
529 update_timers(cpu);
530 preempt_enable();
531 put_online_cpus();
532}
533
534static int watchdog_enable_all_cpus(bool sample_period_changed)
490{ 535{
491 int err = 0; 536 int err = 0;
492 537
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
496 pr_err("Failed to create watchdog threads, disabled\n"); 541 pr_err("Failed to create watchdog threads, disabled\n");
497 else 542 else
498 watchdog_running = 1; 543 watchdog_running = 1;
544 } else if (sample_period_changed) {
545 update_timers_all_cpus();
499 } 546 }
500 547
501 return err; 548 return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
520 void __user *buffer, size_t *lenp, loff_t *ppos) 567 void __user *buffer, size_t *lenp, loff_t *ppos)
521{ 568{
522 int err, old_thresh, old_enabled; 569 int err, old_thresh, old_enabled;
570 static DEFINE_MUTEX(watchdog_proc_mutex);
523 571
572 mutex_lock(&watchdog_proc_mutex);
524 old_thresh = ACCESS_ONCE(watchdog_thresh); 573 old_thresh = ACCESS_ONCE(watchdog_thresh);
525 old_enabled = ACCESS_ONCE(watchdog_user_enabled); 574 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
526 575
527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 576 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 if (err || !write) 577 if (err || !write)
529 return err; 578 goto out;
530 579
531 set_sample_period(); 580 set_sample_period();
532 /* 581 /*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
535 * watchdog_*_all_cpus() function takes care of this. 584 * watchdog_*_all_cpus() function takes care of this.
536 */ 585 */
537 if (watchdog_user_enabled && watchdog_thresh) 586 if (watchdog_user_enabled && watchdog_thresh)
538 err = watchdog_enable_all_cpus(); 587 err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
539 else 588 else
540 watchdog_disable_all_cpus(); 589 watchdog_disable_all_cpus();
541 590
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
544 watchdog_thresh = old_thresh; 593 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled; 594 watchdog_user_enabled = old_enabled;
546 } 595 }
547 596out:
597 mutex_unlock(&watchdog_proc_mutex);
548 return err; 598 return err;
549} 599}
550#endif /* CONFIG_SYSCTL */ 600#endif /* CONFIG_SYSCTL */
@@ -553,14 +603,6 @@ void __init lockup_detector_init(void)
553{ 603{
554 set_sample_period(); 604 set_sample_period();
555 605
556#ifdef CONFIG_NO_HZ_FULL
557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
561 }
562#endif
563
564 if (watchdog_user_enabled) 606 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus(); 607 watchdog_enable_all_cpus(false);
566} 608}
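With update_timers_all_cpus() wired into proc_dowatchdog(), changing the threshold now reprograms the running per-CPU hrtimers and perf counters immediately instead of waiting for a watchdog restart. A userspace sketch of triggering that path (the C equivalent of "sysctl -w kernel.watchdog_thresh=20"; the value 20 is arbitrary):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/watchdog_thresh", "w");

		if (!f) {
			perror("watchdog_thresh");
			return 1;
		}
		/* New threshold in seconds; the resulting sample period now
		 * propagates to already-running watchdog timers. */
		fprintf(f, "20\n");
		fclose(f);
		return 0;
	}
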
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e93f7b9067d8..987293d03ebc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
16 * 16 *
17 * This is the generic async execution mechanism. Work items are 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and 18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and 19 * automatically managed. There are two worker pools for each CPU (one for
20 * one extra for works which are better served by workers which are 20 * normal work items and the other for high priority ones) and some extra
21 * not bound to any specific CPU. 21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
22 * 23 *
23 * Please read Documentation/workqueue.txt for details. 24 * Please read Documentation/workqueue.txt for details.
24 */ 25 */
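The reworded comment describes three pool classes; from the API side they are selected with workqueue flags. A brief sketch of how a caller ends up in each class (the workqueue pointers, names and example_wq_init() are arbitrary):

	#include <linux/errno.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *wq_normal, *wq_highpri, *wq_unbound;

	static int example_wq_init(void)
	{
		wq_normal  = alloc_workqueue("ex_normal", 0, 0);	    /* per-CPU, normal pool */
		wq_highpri = alloc_workqueue("ex_highpri", WQ_HIGHPRI, 0);  /* per-CPU, highpri pool */
		wq_unbound = alloc_workqueue("ex_unbound", WQ_UNBOUND, 0);  /* dynamically managed unbound pools */

		if (!wq_normal || !wq_highpri || !wq_unbound)
			return -ENOMEM;
		return 0;
	}
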
@@ -540,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
540 * This must be called either with pwq_lock held or sched RCU read locked. 541 * This must be called either with pwq_lock held or sched RCU read locked.
541 * If the pwq needs to be used beyond the locking in effect, the caller is 542 * If the pwq needs to be used beyond the locking in effect, the caller is
542 * responsible for guaranteeing that the pwq stays online. 543 * responsible for guaranteeing that the pwq stays online.
544 *
545 * Return: The unbound pool_workqueue for @node.
543 */ 546 */
544static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, 547static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
545 int node) 548 int node)
@@ -638,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
638 * get_work_pool - return the worker_pool a given work was associated with 641 * get_work_pool - return the worker_pool a given work was associated with
639 * @work: the work item of interest 642 * @work: the work item of interest
640 * 643 *
641 * Return the worker_pool @work was last associated with. %NULL if none.
642 *
643 * Pools are created and destroyed under wq_pool_mutex, and allows read 644 * Pools are created and destroyed under wq_pool_mutex, and allows read
644 * access under sched-RCU read lock. As such, this function should be 645 * access under sched-RCU read lock. As such, this function should be
645 * called under wq_pool_mutex or with preemption disabled. 646 * called under wq_pool_mutex or with preemption disabled.
@@ -648,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
648 * mentioned locking is in effect. If the returned pool needs to be used 649 * mentioned locking is in effect. If the returned pool needs to be used
649 * beyond the critical section, the caller is responsible for ensuring the 650 * beyond the critical section, the caller is responsible for ensuring the
650 * returned pool is and stays online. 651 * returned pool is and stays online.
652 *
653 * Return: The worker_pool @work was last associated with. %NULL if none.
651 */ 654 */
652static struct worker_pool *get_work_pool(struct work_struct *work) 655static struct worker_pool *get_work_pool(struct work_struct *work)
653{ 656{
@@ -671,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
671 * get_work_pool_id - return the worker pool ID a given work is associated with 674 * get_work_pool_id - return the worker pool ID a given work is associated with
672 * @work: the work item of interest 675 * @work: the work item of interest
673 * 676 *
674 * Return the worker_pool ID @work was last associated with. 677 * Return: The worker_pool ID @work was last associated with.
675 * %WORK_OFFQ_POOL_NONE if none. 678 * %WORK_OFFQ_POOL_NONE if none.
676 */ 679 */
677static int get_work_pool_id(struct work_struct *work) 680static int get_work_pool_id(struct work_struct *work)
@@ -830,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
830 * CONTEXT: 833 * CONTEXT:
831 * spin_lock_irq(rq->lock) 834 * spin_lock_irq(rq->lock)
832 * 835 *
833 * RETURNS: 836 * Return:
834 * Worker task on @cpu to wake up, %NULL if none. 837 * Worker task on @cpu to wake up, %NULL if none.
835 */ 838 */
836struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) 839struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
@@ -965,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
965 * CONTEXT: 968 * CONTEXT:
966 * spin_lock_irq(pool->lock). 969 * spin_lock_irq(pool->lock).
967 * 970 *
968 * RETURNS: 971 * Return:
969 * Pointer to worker which is executing @work if found, NULL 972 * Pointer to worker which is executing @work if found, %NULL
970 * otherwise. 973 * otherwise.
971 */ 974 */
972static struct worker *find_worker_executing_work(struct worker_pool *pool, 975static struct worker *find_worker_executing_work(struct worker_pool *pool,
@@ -1154,14 +1157,16 @@ out_put:
1154 * @flags: place to store irq state 1157 * @flags: place to store irq state
1155 * 1158 *
1156 * Try to grab PENDING bit of @work. This function can handle @work in any 1159 * Try to grab PENDING bit of @work. This function can handle @work in any
1157 * stable state - idle, on timer or on worklist. Return values are 1160 * stable state - idle, on timer or on worklist.
1158 * 1161 *
1162 * Return:
1159 * 1 if @work was pending and we successfully stole PENDING 1163 * 1 if @work was pending and we successfully stole PENDING
1160 * 0 if @work was idle and we claimed PENDING 1164 * 0 if @work was idle and we claimed PENDING
1161 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry 1165 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1162 * -ENOENT if someone else is canceling @work, this state may persist 1166 * -ENOENT if someone else is canceling @work, this state may persist
1163 * for arbitrarily long 1167 * for arbitrarily long
1164 * 1168 *
1169 * Note:
1165 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 1170 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1166 * interrupted while holding PENDING and @work off queue, irq must be 1171 * interrupted while holding PENDING and @work off queue, irq must be
1167 * disabled on entry. This, combined with delayed_work->timer being 1172 * disabled on entry. This, combined with delayed_work->timer being
@@ -1403,10 +1408,10 @@ retry:
1403 * @wq: workqueue to use 1408 * @wq: workqueue to use
1404 * @work: work to queue 1409 * @work: work to queue
1405 * 1410 *
1406 * Returns %false if @work was already on a queue, %true otherwise.
1407 *
1408 * We queue the work to a specific CPU, the caller must ensure it 1411 * We queue the work to a specific CPU, the caller must ensure it
1409 * can't go away. 1412 * can't go away.
1413 *
1414 * Return: %false if @work was already on a queue, %true otherwise.
1410 */ 1415 */
1411bool queue_work_on(int cpu, struct workqueue_struct *wq, 1416bool queue_work_on(int cpu, struct workqueue_struct *wq,
1412 struct work_struct *work) 1417 struct work_struct *work)
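As a usage note (not part of the patch), a minimal caller of queue_work_on() under the semantics documented above might look like the sketch below; the work handler, the CPU number and the pr_debug() message are hypothetical, and the caller is responsible for keeping that CPU from going offline.

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void flush_stats_fn(struct work_struct *work)
{
        /* runs in the worker pool of the CPU it was queued on */
}
static DECLARE_WORK(flush_stats_work, flush_stats_fn);

static void kick_cpu3_flush(struct workqueue_struct *wq)
{
        /* %false only means the item was already pending somewhere */
        if (!queue_work_on(3, wq, &flush_stats_work))
                pr_debug("flush work already pending\n");
}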
@@ -1476,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1476 * @dwork: work to queue 1481 * @dwork: work to queue
1477 * @delay: number of jiffies to wait before queueing 1482 * @delay: number of jiffies to wait before queueing
1478 * 1483 *
1479 * Returns %false if @work was already on a queue, %true otherwise. If 1484 * Return: %false if @work was already on a queue, %true otherwise. If
1480 * @delay is zero and @dwork is idle, it will be scheduled for immediate 1485 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1481 * execution. 1486 * execution.
1482 */ 1487 */
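Likewise for queue_delayed_work_on(): a self re-arming poller is the common shape. The CPU constant, handler and 100 ms period below are hypothetical; note that, per the text above, a zero @delay on an idle @dwork means immediate scheduling.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define POLL_CPU        0       /* hypothetical: CPU owning the device state */

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void poll_fn(struct work_struct *work)
{
        /* ... poll the hardware ..., then re-arm on the same CPU */
        queue_delayed_work_on(POLL_CPU, system_wq, &poll_dwork,
                              msecs_to_jiffies(100));
}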
@@ -1512,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on);
1512 * zero, @dwork is guaranteed to be scheduled immediately regardless of its 1517 * zero, @dwork is guaranteed to be scheduled immediately regardless of its
1513 * current state. 1518 * current state.
1514 * 1519 *
1515 * Returns %false if @dwork was idle and queued, %true if @dwork was 1520 * Return: %false if @dwork was idle and queued, %true if @dwork was
1516 * pending and its timer was modified. 1521 * pending and its timer was modified.
1517 * 1522 *
1518 * This function is safe to call from any context including IRQ handler. 1523 * This function is safe to call from any context including IRQ handler.
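The %false/%true distinction documented above maps naturally onto the timer-reset ("pat the watchdog") pattern; the sketch below uses the convenience wrapper mod_delayed_work() on the system workqueue, with hypothetical names and a hypothetical 500 ms timeout.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void timeout_fn(struct work_struct *work)
{
        /* hypothetical: only fires if pet_timeout() stops being called */
}
static DECLARE_DELAYED_WORK(timeout_dwork, timeout_fn);

static void pet_timeout(void)
{
        /* %true: pending timer pushed back; %false: was idle, now queued */
        mod_delayed_work(system_wq, &timeout_dwork, msecs_to_jiffies(500));
}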
@@ -1627,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker)
1627 * Might sleep. Called without any lock but returns with pool->lock 1632 * Might sleep. Called without any lock but returns with pool->lock
1628 * held. 1633 * held.
1629 * 1634 *
1630 * RETURNS: 1635 * Return:
1631 * %true if the associated pool is online (@worker is successfully 1636 * %true if the associated pool is online (@worker is successfully
1632 * bound), %false if offline. 1637 * bound), %false if offline.
1633 */ 1638 */
@@ -1688,7 +1693,7 @@ static struct worker *alloc_worker(void)
1688 * CONTEXT: 1693 * CONTEXT:
1689 * Might sleep. Does GFP_KERNEL allocations. 1694 * Might sleep. Does GFP_KERNEL allocations.
1690 * 1695 *
1691 * RETURNS: 1696 * Return:
1692 * Pointer to the newly created worker. 1697 * Pointer to the newly created worker.
1693 */ 1698 */
1694static struct worker *create_worker(struct worker_pool *pool) 1699static struct worker *create_worker(struct worker_pool *pool)
@@ -1788,6 +1793,8 @@ static void start_worker(struct worker *worker)
1788 * @pool: the target pool 1793 * @pool: the target pool
1789 * 1794 *
1790 * Grab the managership of @pool and create and start a new worker for it. 1795 * Grab the managership of @pool and create and start a new worker for it.
1796 *
1797 * Return: 0 on success. A negative error code otherwise.
1791 */ 1798 */
1792static int create_and_start_worker(struct worker_pool *pool) 1799static int create_and_start_worker(struct worker_pool *pool)
1793{ 1800{
@@ -1932,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1932 * multiple times. Does GFP_KERNEL allocations. Called only from 1939 * multiple times. Does GFP_KERNEL allocations. Called only from
1933 * manager. 1940 * manager.
1934 * 1941 *
1935 * RETURNS: 1942 * Return:
1936 * %false if no action was taken and pool->lock stayed locked, %true 1943 * %false if no action was taken and pool->lock stayed locked, %true
1937 * otherwise. 1944 * otherwise.
1938 */ 1945 */
@@ -1989,7 +1996,7 @@ restart:
1989 * spin_lock_irq(pool->lock) which may be released and regrabbed 1996 * spin_lock_irq(pool->lock) which may be released and regrabbed
1990 * multiple times. Called only from manager. 1997 * multiple times. Called only from manager.
1991 * 1998 *
1992 * RETURNS: 1999 * Return:
1993 * %false if no action was taken and pool->lock stayed locked, %true 2000 * %false if no action was taken and pool->lock stayed locked, %true
1994 * otherwise. 2001 * otherwise.
1995 */ 2002 */
@@ -2032,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2032 * spin_lock_irq(pool->lock) which may be released and regrabbed 2039 * spin_lock_irq(pool->lock) which may be released and regrabbed
2033 * multiple times. Does GFP_KERNEL allocations. 2040 * multiple times. Does GFP_KERNEL allocations.
2034 * 2041 *
2035 * RETURNS: 2042 * Return:
2036 * spin_lock_irq(pool->lock) which may be released and regrabbed 2043 * %false if the pool doesn't need management and the caller can safely start
2037 * multiple times. Does GFP_KERNEL allocations. 2044 * processing works, %true indicates that the function released pool->lock
2045 * and reacquired it to perform some management function and that the
2046 * conditions that the caller verified while holding the lock before
2047 * calling the function might no longer be true.
2038 */ 2048 */
2039static bool manage_workers(struct worker *worker) 2049static bool manage_workers(struct worker *worker)
2040{ 2050{
@@ -2255,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker)
2255 * work items regardless of their specific target workqueue. The only 2265 * work items regardless of their specific target workqueue. The only
2256 * exception is work items which belong to workqueues with a rescuer which 2266 * exception is work items which belong to workqueues with a rescuer which
2257 * will be explained in rescuer_thread(). 2267 * will be explained in rescuer_thread().
2268 *
2269 * Return: 0
2258 */ 2270 */
2259static int worker_thread(void *__worker) 2271static int worker_thread(void *__worker)
2260{ 2272{
@@ -2353,6 +2365,8 @@ sleep:
2353 * those works so that forward progress can be guaranteed. 2365 * those works so that forward progress can be guaranteed.
2354 * 2366 *
2355 * This should happen rarely. 2367 * This should happen rarely.
2368 *
2369 * Return: 0
2356 */ 2370 */
2357static int rescuer_thread(void *__rescuer) 2371static int rescuer_thread(void *__rescuer)
2358{ 2372{
@@ -2525,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2525 * CONTEXT: 2539 * CONTEXT:
2526 * mutex_lock(wq->mutex). 2540 * mutex_lock(wq->mutex).
2527 * 2541 *
2528 * RETURNS: 2542 * Return:
2529 * %true if @flush_color >= 0 and there's something to flush. %false 2543 * %true if @flush_color >= 0 and there's something to flush. %false
2530 * otherwise. 2544 * otherwise.
2531 */ 2545 */
@@ -2846,7 +2860,7 @@ static bool __flush_work(struct work_struct *work)
2846 * Wait until @work has finished execution. @work is guaranteed to be idle 2860 * Wait until @work has finished execution. @work is guaranteed to be idle
2847 * on return if it hasn't been requeued since flush started. 2861 * on return if it hasn't been requeued since flush started.
2848 * 2862 *
2849 * RETURNS: 2863 * Return:
2850 * %true if flush_work() waited for the work to finish execution, 2864 * %true if flush_work() waited for the work to finish execution,
2851 * %false if it was already idle. 2865 * %false if it was already idle.
2852 */ 2866 */
@@ -2898,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2898 * The caller must ensure that the workqueue on which @work was last 2912 * The caller must ensure that the workqueue on which @work was last
2899 * queued can't be destroyed before this function returns. 2913 * queued can't be destroyed before this function returns.
2900 * 2914 *
2901 * RETURNS: 2915 * Return:
2902 * %true if @work was pending, %false otherwise. 2916 * %true if @work was pending, %false otherwise.
2903 */ 2917 */
2904bool cancel_work_sync(struct work_struct *work) 2918bool cancel_work_sync(struct work_struct *work)
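flush_work() and cancel_work_sync() differ exactly as the two comments above describe: the former only waits, the latter also unqueues. A typical driver teardown path, with hypothetical names, might rely on cancel_work_sync() like this:

#include <linux/kernel.h>
#include <linux/workqueue.h>

/* hypothetical teardown for a driver that queued @io_work earlier */
static void example_teardown(struct work_struct *io_work)
{
        /*
         * After this returns the handler is neither queued nor running,
         * so state it touches can be freed; the return value only says
         * whether an instance was still pending.
         */
        if (cancel_work_sync(io_work))
                pr_debug("io_work was still pending at teardown\n");
}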
@@ -2915,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2915 * immediate execution. Like flush_work(), this function only 2929 * immediate execution. Like flush_work(), this function only
2916 * considers the last queueing instance of @dwork. 2930 * considers the last queueing instance of @dwork.
2917 * 2931 *
2918 * RETURNS: 2932 * Return:
2919 * %true if flush_work() waited for the work to finish execution, 2933 * %true if flush_work() waited for the work to finish execution,
2920 * %false if it was already idle. 2934 * %false if it was already idle.
2921 */ 2935 */
@@ -2933,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work);
2933 * cancel_delayed_work - cancel a delayed work 2947 * cancel_delayed_work - cancel a delayed work
2934 * @dwork: delayed_work to cancel 2948 * @dwork: delayed_work to cancel
2935 * 2949 *
2936 * Kill off a pending delayed_work. Returns %true if @dwork was pending 2950 * Kill off a pending delayed_work.
2937 * and canceled; %false if wasn't pending. Note that the work callback 2951 *
2938 * function may still be running on return, unless it returns %true and the 2952 * Return: %true if @dwork was pending and canceled; %false if it wasn't
2939 * work doesn't re-arm itself. Explicitly flush or use 2953 * pending.
2940 * cancel_delayed_work_sync() to wait on it. 2954 *
2955 * Note:
2956 * The work callback function may still be running on return, unless
2957 * it returns %true and the work doesn't re-arm itself. Explicitly flush or
2958 * use cancel_delayed_work_sync() to wait on it.
2941 * 2959 *
2942 * This function is safe to call from any context including IRQ handler. 2960 * This function is safe to call from any context including IRQ handler.
2943 */ 2961 */
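The note added above is the practical difference between the two cancel flavours; a hypothetical helper choosing between them could look as follows (retry_dwork is assumed to be initialized elsewhere):

#include <linux/workqueue.h>

static struct delayed_work retry_dwork;         /* initialized elsewhere */

static void stop_retries(bool may_sleep)
{
        if (may_sleep) {
                /* also waits for a running callback to finish */
                cancel_delayed_work_sync(&retry_dwork);
        } else {
                /*
                 * Safe even from IRQ context, but per the note above the
                 * callback may still be running when this returns.
                 */
                cancel_delayed_work(&retry_dwork);
        }
}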
@@ -2966,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work);
2966 * 2984 *
2967 * This is cancel_work_sync() for delayed works. 2985 * This is cancel_work_sync() for delayed works.
2968 * 2986 *
2969 * RETURNS: 2987 * Return:
2970 * %true if @dwork was pending, %false otherwise. 2988 * %true if @dwork was pending, %false otherwise.
2971 */ 2989 */
2972bool cancel_delayed_work_sync(struct delayed_work *dwork) 2990bool cancel_delayed_work_sync(struct delayed_work *dwork)
@@ -2983,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
2983 * system workqueue and blocks until all CPUs have completed. 3001 * system workqueue and blocks until all CPUs have completed.
2984 * schedule_on_each_cpu() is very slow. 3002 * schedule_on_each_cpu() is very slow.
2985 * 3003 *
2986 * RETURNS: 3004 * Return:
2987 * 0 on success, -errno on failure. 3005 * 0 on success, -errno on failure.
2988 */ 3006 */
2989int schedule_on_each_cpu(work_func_t func) 3007int schedule_on_each_cpu(work_func_t func)
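As a quick illustration of the 0/-errno contract above (hypothetical function names, not from this patch):

#include <linux/workqueue.h>

static void drain_percpu_cache(struct work_struct *work)
{
        /* hypothetical per-CPU cleanup; runs once on every online CPU */
}

static int drain_all_caches(void)
{
        /* sleeps until every CPU has run the function; returns 0 or -errno */
        return schedule_on_each_cpu(drain_percpu_cache);
}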
@@ -3051,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work);
3051 * Executes the function immediately if process context is available, 3069 * Executes the function immediately if process context is available,
3052 * otherwise schedules the function for delayed execution. 3070 * otherwise schedules the function for delayed execution.
3053 * 3071 *
3054 * Returns: 0 - function was executed 3072 * Return: 0 - function was executed
3055 * 1 - function was scheduled for execution 3073 * 1 - function was scheduled for execution
3056 */ 3074 */
3057int execute_in_process_context(work_func_t fn, struct execute_work *ew) 3075int execute_in_process_context(work_func_t fn, struct execute_work *ew)
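The 0/1 return documented above distinguishes synchronous from deferred execution. A hedged sketch, with hypothetical names and a statically allocated struct execute_work that must outlive any deferred run:

#include <linux/workqueue.h>

static struct execute_work cleanup_ew;  /* must outlive a deferred run */

static void cleanup_fn(struct work_struct *work)
{
        /* hypothetical cleanup that needs process context */
}

static void request_cleanup(void)
{
        /* 0: ran synchronously here; 1: deferred to a workqueue */
        int deferred = execute_in_process_context(cleanup_fn, &cleanup_ew);

        (void)deferred;
}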
@@ -3095,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
3095 return wq_dev->wq; 3113 return wq_dev->wq;
3096} 3114}
3097 3115
3098static ssize_t wq_per_cpu_show(struct device *dev, 3116static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3099 struct device_attribute *attr, char *buf) 3117 char *buf)
3100{ 3118{
3101 struct workqueue_struct *wq = dev_to_wq(dev); 3119 struct workqueue_struct *wq = dev_to_wq(dev);
3102 3120
3103 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 3121 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3104} 3122}
3123static DEVICE_ATTR_RO(per_cpu);
3105 3124
3106static ssize_t wq_max_active_show(struct device *dev, 3125static ssize_t max_active_show(struct device *dev,
3107 struct device_attribute *attr, char *buf) 3126 struct device_attribute *attr, char *buf)
3108{ 3127{
3109 struct workqueue_struct *wq = dev_to_wq(dev); 3128 struct workqueue_struct *wq = dev_to_wq(dev);
3110 3129
3111 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 3130 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3112} 3131}
3113 3132
3114static ssize_t wq_max_active_store(struct device *dev, 3133static ssize_t max_active_store(struct device *dev,
3115 struct device_attribute *attr, 3134 struct device_attribute *attr, const char *buf,
3116 const char *buf, size_t count) 3135 size_t count)
3117{ 3136{
3118 struct workqueue_struct *wq = dev_to_wq(dev); 3137 struct workqueue_struct *wq = dev_to_wq(dev);
3119 int val; 3138 int val;
@@ -3124,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev,
3124 workqueue_set_max_active(wq, val); 3143 workqueue_set_max_active(wq, val);
3125 return count; 3144 return count;
3126} 3145}
3146static DEVICE_ATTR_RW(max_active);
3127 3147
3128static struct device_attribute wq_sysfs_attrs[] = { 3148static struct attribute *wq_sysfs_attrs[] = {
3129 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), 3149 &dev_attr_per_cpu.attr,
3130 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), 3150 &dev_attr_max_active.attr,
3131 __ATTR_NULL, 3151 NULL,
3132}; 3152};
3153ATTRIBUTE_GROUPS(wq_sysfs);
3133 3154
3134static ssize_t wq_pool_ids_show(struct device *dev, 3155static ssize_t wq_pool_ids_show(struct device *dev,
3135 struct device_attribute *attr, char *buf) 3156 struct device_attribute *attr, char *buf)
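The two hunks above replace open-coded show/store wiring with the DEVICE_ATTR_RO()/DEVICE_ATTR_RW() and ATTRIBUTE_GROUPS() helpers, whose naming contract is the whole trick: DEVICE_ATTR_RW(x) expects x_show()/x_store() and emits dev_attr_x, and ATTRIBUTE_GROUPS(y) expects y_attrs[] and emits y_groups[], which the next hunk plugs into the bus. A generic, hypothetical "foo" attribute shows the shape:

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

/* hypothetical read-write attribute; DEVICE_ATTR_RW(foo) needs these names */
static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
}

static ssize_t foo_store(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count)
{
        return count;
}
static DEVICE_ATTR_RW(foo);             /* emits dev_attr_foo, mode 0644 */

static struct attribute *foo_sysfs_attrs[] = {
        &dev_attr_foo.attr,
        NULL,
};
ATTRIBUTE_GROUPS(foo_sysfs);            /* emits foo_sysfs_groups[] */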
@@ -3279,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
3279 3300
3280static struct bus_type wq_subsys = { 3301static struct bus_type wq_subsys = {
3281 .name = "workqueue", 3302 .name = "workqueue",
3282 .dev_attrs = wq_sysfs_attrs, 3303 .dev_groups = wq_sysfs_groups,
3283}; 3304};
3284 3305
3285static int __init wq_sysfs_init(void) 3306static int __init wq_sysfs_init(void)
@@ -3308,7 +3329,7 @@ static void wq_device_release(struct device *dev)
3308 * apply_workqueue_attrs() may race against userland updating the 3329 * apply_workqueue_attrs() may race against userland updating the
3309 * attributes. 3330 * attributes.
3310 * 3331 *
3311 * Returns 0 on success, -errno on failure. 3332 * Return: 0 on success, -errno on failure.
3312 */ 3333 */
3313int workqueue_sysfs_register(struct workqueue_struct *wq) 3334int workqueue_sysfs_register(struct workqueue_struct *wq)
3314{ 3335{
@@ -3401,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
3401 * @gfp_mask: allocation mask to use 3422 * @gfp_mask: allocation mask to use
3402 * 3423 *
3403 * Allocate a new workqueue_attrs, initialize with default settings and 3424 * Allocate a new workqueue_attrs, initialize with default settings and
3404 * return it. Returns NULL on failure. 3425 * return it.
3426 *
3427 * Return: The allocated new workqueue_attrs on success. %NULL on failure.
3405 */ 3428 */
3406struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) 3429struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3407{ 3430{
@@ -3460,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
3460 * @pool: worker_pool to initialize 3483 * @pool: worker_pool to initialize
3461 * 3484 *
3462 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. 3485 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
3463 * Returns 0 on success, -errno on failure. Even on failure, all fields 3486 *
3487 * Return: 0 on success, -errno on failure. Even on failure, all fields
3464 * inside @pool proper are initialized and put_unbound_pool() can be called 3488 * inside @pool proper are initialized and put_unbound_pool() can be called
3465 * on @pool safely to release it. 3489 * on @pool safely to release it.
3466 */ 3490 */
@@ -3567,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool)
3567 * Obtain a worker_pool which has the same attributes as @attrs, bump the 3591 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3568 * reference count and return it. If there already is a matching 3592 * reference count and return it. If there already is a matching
3569 * worker_pool, it will be used; otherwise, this function attempts to 3593 * worker_pool, it will be used; otherwise, this function attempts to
3570 * create a new one. On failure, returns NULL. 3594 * create a new one.
3571 * 3595 *
3572 * Should be called with wq_pool_mutex held. 3596 * Should be called with wq_pool_mutex held.
3597 *
3598 * Return: On success, a worker_pool with the same attributes as @attrs.
3599 * On failure, %NULL.
3573 */ 3600 */
3574static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 3601static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3575{ 3602{
@@ -3805,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3805 * 3832 *
3806 * Calculate the cpumask a workqueue with @attrs should use on @node. If 3833 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3807 * @cpu_going_down is >= 0, that cpu is considered offline during 3834 * @cpu_going_down is >= 0, that cpu is considered offline during
3808 * calculation. The result is stored in @cpumask. This function returns 3835 * calculation. The result is stored in @cpumask.
3809 * %true if the resulting @cpumask is different from @attrs->cpumask,
3810 * %false if equal.
3811 * 3836 *
3812 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If 3837 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3813 * enabled and @node has online CPUs requested by @attrs, the returned 3838 * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3816,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3816 * 3841 *
3817 * The caller is responsible for ensuring that the cpumask of @node stays 3842 * The caller is responsible for ensuring that the cpumask of @node stays
3818 * stable. 3843 * stable.
3844 *
3845 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
3846 * %false if equal.
3819 */ 3847 */
3820static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, 3848static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3821 int cpu_going_down, cpumask_t *cpumask) 3849 int cpu_going_down, cpumask_t *cpumask)
@@ -3869,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3869 * items finish. Note that a work item which repeatedly requeues itself 3897 * items finish. Note that a work item which repeatedly requeues itself
3870 * back-to-back will stay on its current pwq. 3898 * back-to-back will stay on its current pwq.
3871 * 3899 *
3872 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on 3900 * Performs GFP_KERNEL allocations.
3873 * failure. 3901 *
3902 * Return: 0 on success and -errno on failure.
3874 */ 3903 */
3875int apply_workqueue_attrs(struct workqueue_struct *wq, 3904int apply_workqueue_attrs(struct workqueue_struct *wq,
3876 const struct workqueue_attrs *attrs) 3905 const struct workqueue_attrs *attrs)
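For an unbound (WQ_UNBOUND) workqueue, the usual call sequence behind the 0/-errno contract above is: allocate attrs, tweak the nice level and cpumask, apply, free. A minimal sketch with hypothetical values (nice 10, CPUs 0-3):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/workqueue.h>

static int confine_wq(struct workqueue_struct *unbound_wq)
{
        struct workqueue_attrs *attrs;
        int ret;

        attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!attrs)
                return -ENOMEM;

        attrs->nice = 10;
        cpumask_clear(attrs->cpumask);
        cpumask_set_cpu(0, attrs->cpumask);
        cpumask_set_cpu(1, attrs->cpumask);
        cpumask_set_cpu(2, attrs->cpumask);
        cpumask_set_cpu(3, attrs->cpumask);

        /* may sleep (GFP_KERNEL allocations); 0 on success, -errno otherwise */
        ret = apply_workqueue_attrs(unbound_wq, attrs);

        free_workqueue_attrs(attrs);
        return ret;
}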
@@ -4338,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
4338 * 4367 *
4339 * Determine whether %current is a workqueue rescuer. Can be used from 4368 * Determine whether %current is a workqueue rescuer. Can be used from
4340 * work functions to determine whether it's being run off the rescuer task. 4369 * work functions to determine whether it's being run off the rescuer task.
4370 *
4371 * Return: %true if %current is a workqueue rescuer. %false otherwise.
4341 */ 4372 */
4342bool current_is_workqueue_rescuer(void) 4373bool current_is_workqueue_rescuer(void)
4343{ 4374{
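One plausible use of the new return documentation, sketched with hypothetical names: a work handler that takes a leaner path when it finds itself running off the rescuer during memory pressure.

#include <linux/printk.h>
#include <linux/workqueue.h>

static void nic_xmit_work(struct work_struct *work)
{
        if (current_is_workqueue_rescuer())
                /* hypothetical: avoid further allocations on this path */
                pr_warn_ratelimited("xmit work running off the rescuer\n");
}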
@@ -4361,7 +4392,7 @@ bool current_is_workqueue_rescuer(void)
4361 * workqueue being congested on one CPU doesn't mean the workqueue is also 4392 * workqueue being congested on one CPU doesn't mean the workqueue is also
4362 * congested on other CPUs / NUMA nodes. 4393 * congested on other CPUs / NUMA nodes.
4363 * 4394 *
4364 * RETURNS: 4395 * Return:
4365 * %true if congested, %false otherwise. 4396 * %true if congested, %false otherwise.
4366 */ 4397 */
4367bool workqueue_congested(int cpu, struct workqueue_struct *wq) 4398bool workqueue_congested(int cpu, struct workqueue_struct *wq)
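Because the result is advisory, as the context above stresses, callers can only use it for best-effort decisions; a hypothetical back-off helper:

#include <linux/workqueue.h>

/* hypothetical: drop low-value telemetry while its workqueue is busy */
static bool should_defer_telemetry(int cpu, struct workqueue_struct *wq)
{
        /* advisory only: the answer may already be stale when acted upon */
        return workqueue_congested(cpu, wq);
}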
@@ -4394,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
4394 * synchronization around this function and the test result is 4425 * synchronization around this function and the test result is
4395 * unreliable and only useful as advisory hints or for debugging. 4426 * unreliable and only useful as advisory hints or for debugging.
4396 * 4427 *
4397 * RETURNS: 4428 * Return:
4398 * OR'd bitmask of WORK_BUSY_* bits. 4429 * OR'd bitmask of WORK_BUSY_* bits.
4399 */ 4430 */
4400unsigned int work_busy(struct work_struct *work) 4431unsigned int work_busy(struct work_struct *work)
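The OR'd bitmask return is easiest to read back through the WORK_BUSY_* flags; a hypothetical debug helper:

#include <linux/printk.h>
#include <linux/workqueue.h>

static void report_work_state(struct work_struct *work)
{
        unsigned int busy = work_busy(work);    /* advisory snapshot */

        pr_debug("work %p:%s%s\n", work,
                 (busy & WORK_BUSY_PENDING) ? " pending" : "",
                 (busy & WORK_BUSY_RUNNING) ? " running" : "");
}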
@@ -4772,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work)
4772 * @fn: the function to run 4803 * @fn: the function to run
4773 * @arg: the function arg 4804 * @arg: the function arg
4774 * 4805 *
4775 * This will return the value @fn returns.
4776 * It is up to the caller to ensure that the cpu doesn't go offline. 4806 * It is up to the caller to ensure that the cpu doesn't go offline.
4777 * The caller must not hold any locks which would prevent @fn from completing. 4807 * The caller must not hold any locks which would prevent @fn from completing.
4808 *
4809 * Return: The value @fn returns.
4778 */ 4810 */
4779long work_on_cpu(int cpu, long (*fn)(void *), void *arg) 4811long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4780{ 4812{
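The "Return: The value @fn returns" line above is the whole interface: work_on_cpu() sleeps, runs @fn on @cpu through the workqueue machinery, and hands its long return value back. A hypothetical reader of a per-CPU value, assuming the caller has pinned the CPU online (e.g. under get_online_cpus()):

#include <linux/workqueue.h>

static long read_local_value(void *arg)
{
        return *(long *)arg;            /* executes on the target CPU */
}

static long read_value_on(int cpu, long *pvalue)
{
        /* sleeps; the caller must keep @cpu online across the call */
        return work_on_cpu(cpu, read_local_value, pvalue);
}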
@@ -4846,7 +4878,7 @@ void freeze_workqueues_begin(void)
4846 * CONTEXT: 4878 * CONTEXT:
4847 * Grabs and releases wq_pool_mutex. 4879 * Grabs and releases wq_pool_mutex.
4848 * 4880 *
4849 * RETURNS: 4881 * Return:
4850 * %true if some freezable workqueues are still busy. %false if freezing 4882 * %true if some freezable workqueues are still busy. %false if freezing
4851 * is complete. 4883 * is complete.
4852 */ 4884 */