Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/audit.c | 5
-rw-r--r--  kernel/bounds.c | 4
-rw-r--r--  kernel/capability.c | 13
-rw-r--r--  kernel/cgroup.c | 1903
-rw-r--r--  kernel/cgroup_freezer.c | 155
-rw-r--r--  kernel/context_tracking.c | 139
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/cpuset.c | 337
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 609
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 126
-rw-r--r--  kernel/events/uprobes.c | 227
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 54
-rw-r--r--  kernel/gcov/fs.c | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 13
-rw-r--r--  kernel/irq/Kconfig | 12
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/kexec.c | 5
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 95
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/lglock.c | 12
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/modsign_pubkey.c | 6
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/mutex.c | 79
-rw-r--r--  kernel/nsproxy.c | 63
-rw-r--r--  kernel/padata.c | 32
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 34
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 6
-rw-r--r--  kernel/power/hibernate.c | 49
-rw-r--r--  kernel/power/qos.c | 20
-rw-r--r--  kernel/power/snapshot.c | 17
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/user.c | 32
-rw-r--r--  kernel/printk/printk.c | 7
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 19
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 39
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 2
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 400
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 439
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 21
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 542
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 114
-rw-r--r--  kernel/reboot.c | 9
-rw-r--r--  kernel/res_counter.c | 25
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 907
-rw-r--r--  kernel/sched/cpuacct.c | 51
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 74
-rw-r--r--  kernel/sched/debug.c | 74
-rw-r--r--  kernel/sched/fair.c | 1912
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 68
-rw-r--r--  kernel/sched/stats.h | 51
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 130
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 39
-rw-r--r--  kernel/softirq.c | 58
-rw-r--r--  kernel/spinlock.c | 14
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 20
-rw-r--r--  kernel/sysctl.c | 32
-rw-r--r--  kernel/task_work.c | 40
-rw-r--r--  kernel/time/Kconfig | 53
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 67
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 9
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 60
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_list.c | 41
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/ftrace.c | 17
-rw-r--r--  kernel/trace/trace.c | 40
-rw-r--r--  kernel/trace/trace.h | 5
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 207
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 10
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/up.c | 58
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 68
-rw-r--r--  kernel/workqueue.c | 157
114 files changed, 6839 insertions(+), 4213 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..a4d1aa8da9bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o semaphore.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
@@ -26,6 +26,8 @@ obj-y += sched/
26obj-y += power/ 26obj-y += power/
27obj-y += printk/ 27obj-y += printk/
28obj-y += cpu/ 28obj-y += cpu/
29obj-y += irq/
30obj-y += rcu/
29 31
30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 32obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
31obj-$(CONFIG_FREEZER) += freezer.o 33obj-$(CONFIG_FREEZER) += freezer.o
@@ -79,14 +81,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
79obj-$(CONFIG_KGDB) += debug/ 81obj-$(CONFIG_KGDB) += debug/
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 82obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 83obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
83obj-$(CONFIG_SECCOMP) += seccomp.o 84obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 85obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1117 1117
1118 sleep_time = timeout_start + audit_backlog_wait_time - 1118 sleep_time = timeout_start + audit_backlog_wait_time -
1119 jiffies; 1119 jiffies;
1120 if ((long)sleep_time > 0) 1120 if ((long)sleep_time > 0) {
1121 wait_for_auditd(sleep_time); 1121 wait_for_auditd(sleep_time);
1122 continue; 1122 continue;
1123 }
1123 } 1124 }
1124 if (audit_rate_check() && printk_ratelimit()) 1125 if (audit_rate_check() && printk_ratelimit())
1125 printk(KERN_WARNING 1126 printk(KERN_WARNING
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap)
433EXPORT_SYMBOL(capable); 433EXPORT_SYMBOL(capable);
434 434
435/** 435/**
436 * nsown_capable - Check superior capability to one's own user_ns
437 * @cap: The capability in question
438 *
439 * Return true if the current task has the given superior capability
440 * targeted at its own user namespace.
441 */
442bool nsown_capable(int cap)
443{
444 return ns_capable(current_user_ns(), cap);
445}
446
447/**
448 * inode_capable - Check superior capability over inode 436 * inode_capable - Check superior capability over inode
449 * @inode: The inode in question 437 * @inode: The inode in question
450 * @cap: The capability in question 438 * @cap: The capability in question
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
464 452
465 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 453 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
466} 454}
455EXPORT_SYMBOL(inode_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013ab..e0839bcd48c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
60#include <linux/poll.h> 60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 62#include <linux/kthread.h>
63#include <linux/file.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
@@ -81,7 +82,7 @@
81 */ 82 */
82#ifdef CONFIG_PROVE_RCU 83#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 84DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 85EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 86#else
86static DEFINE_MUTEX(cgroup_mutex); 87static DEFINE_MUTEX(cgroup_mutex);
87#endif 88#endif
@@ -117,51 +118,20 @@ struct cfent {
117 struct list_head node; 118 struct list_head node;
118 struct dentry *dentry; 119 struct dentry *dentry;
119 struct cftype *type; 120 struct cftype *type;
121 struct cgroup_subsys_state *css;
120 122
121 /* file xattrs */ 123 /* file xattrs */
122 struct simple_xattrs xattrs; 124 struct simple_xattrs xattrs;
123}; 125};
124 126
125/* 127/*
126 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
127 * cgroup_subsys->use_id != 0.
128 */
129#define CSS_ID_MAX (65535)
130struct css_id {
131 /*
132 * The css to which this ID points. This pointer is set to valid value
133 * after cgroup is populated. If cgroup is removed, this will be NULL.
134 * This pointer is expected to be RCU-safe because destroy()
135 * is called after synchronize_rcu(). But for safe use, css_tryget()
136 * should be used for avoiding race.
137 */
138 struct cgroup_subsys_state __rcu *css;
139 /*
140 * ID of this css.
141 */
142 unsigned short id;
143 /*
144 * Depth in hierarchy which this ID belongs to.
145 */
146 unsigned short depth;
147 /*
148 * ID is freed by RCU. (and lookup routine is RCU safe.)
149 */
150 struct rcu_head rcu_head;
151 /*
152 * Hierarchy of CSS ID belongs to.
153 */
154 unsigned short stack[0]; /* Array of Length (depth+1) */
155};
156
157/*
158 * cgroup_event represents events which userspace want to receive. 128 * cgroup_event represents events which userspace want to receive.
159 */ 129 */
160struct cgroup_event { 130struct cgroup_event {
161 /* 131 /*
162 * Cgroup which the event belongs to. 132 * css which the event belongs to.
163 */ 133 */
164 struct cgroup *cgrp; 134 struct cgroup_subsys_state *css;
165 /* 135 /*
166 * Control file which the event associated. 136 * Control file which the event associated.
167 */ 137 */
@@ -215,10 +185,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 185 */
216static int need_forkexit_callback __read_mostly; 186static int need_forkexit_callback __read_mostly;
217 187
218static void cgroup_offline_fn(struct work_struct *work); 188static struct cftype cgroup_base_files[];
189
190static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 193 bool is_add);
194
195/**
196 * cgroup_css - obtain a cgroup's css for the specified subsystem
197 * @cgrp: the cgroup of interest
198 * @ss: the subsystem of interest (%NULL returns the dummy_css)
199 *
200 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
201 * function must be called either under cgroup_mutex or rcu_read_lock() and
202 * the caller is responsible for pinning the returned css if it wants to
203 * keep accessing it outside the said locks. This function may return
204 * %NULL if @cgrp doesn't have @subsys_id enabled.
205 */
206static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
207 struct cgroup_subsys *ss)
208{
209 if (ss)
210 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
211 lockdep_is_held(&cgroup_mutex));
212 else
213 return &cgrp->dummy_css;
214}
222 215
223/* convenient tests for these bits */ 216/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 217static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -362,12 +355,11 @@ struct cgrp_cset_link {
362static struct css_set init_css_set; 355static struct css_set init_css_set;
363static struct cgrp_cset_link init_cgrp_cset_link; 356static struct cgrp_cset_link init_cgrp_cset_link;
364 357
365static int cgroup_init_idr(struct cgroup_subsys *ss, 358/*
366 struct cgroup_subsys_state *css); 359 * css_set_lock protects the list of css_set objects, and the chain of
367 360 * tasks off each css_set. Nests outside task->alloc_lock due to
368/* css_set_lock protects the list of css_set objects, and the 361 * css_task_iter_start().
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 362 */
370 * due to cgroup_iter_start() */
371static DEFINE_RWLOCK(css_set_lock); 363static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 364static int css_set_count;
373 365
@@ -392,10 +384,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 384 return key;
393} 385}
394 386
395/* We don't maintain the lists running through each css_set to its 387/*
396 * task until after the first call to cgroup_iter_start(). This 388 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 389 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 390 * fork()/exit() overhead for people who have cgroups compiled into their
391 * kernel but not actually in use.
392 */
399static int use_task_css_set_links __read_mostly; 393static int use_task_css_set_links __read_mostly;
400 394
401static void __put_css_set(struct css_set *cset, int taskexit) 395static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +458,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 458 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 459 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 460 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 461 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 462 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 463 */
470static bool compare_css_sets(struct css_set *cset, 464static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +549,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 549 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 550 * the subsystem state from the new
557 * cgroup */ 551 * cgroup */
558 template[i] = cgrp->subsys[i]; 552 template[i] = cgroup_css(cgrp, ss);
559 } else { 553 } else {
560 /* Subsystem is not in this hierarchy, so we 554 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 555 * don't want to change the subsystem state */
@@ -803,8 +797,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 797
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 798static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 799static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 800static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 801static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 802static const struct file_operations proc_cgroupstats_operations;
810 803
@@ -813,9 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 806 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 807};
815 808
816static int alloc_css_id(struct cgroup_subsys *ss,
817 struct cgroup *parent, struct cgroup *child);
818
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 809static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 810{
821 struct inode *inode = new_inode(sb); 811 struct inode *inode = new_inode(sb);
@@ -845,15 +835,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 835static void cgroup_free_fn(struct work_struct *work)
846{ 836{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 837 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 838
850 mutex_lock(&cgroup_mutex); 839 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 840 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 841 mutex_unlock(&cgroup_mutex);
859 842
@@ -864,8 +847,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 847 */
865 dput(cgrp->parent->dentry); 848 dput(cgrp->parent->dentry);
866 849
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 850 /*
870 * Drop the active superblock reference that we took when we 851 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 852 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +937,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 937}
957 938
958/** 939/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 940 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 941 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 942 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 943 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 944static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 945{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 946 struct cgroup_subsys *ss;
947 int i;
969 948
970 for_each_root_subsys(cgrp->root, ss) { 949 for_each_subsys(ss, i) {
971 struct cftype_set *set; 950 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 951
952 if (!test_bit(i, &subsys_mask))
973 continue; 953 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 954 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 955 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 956 }
981} 957}
982 958
@@ -986,9 +962,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 962static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 963{
988 struct dentry *parent; 964 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 965
993 parent = dentry->d_parent; 966 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 967 spin_lock(&parent->d_lock);
@@ -1009,79 +982,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 982{
1010 struct cgroup *cgrp = &root->top_cgroup; 983 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 984 struct cgroup_subsys *ss;
1012 int i; 985 unsigned long pinned = 0;
986 int i, ret;
1013 987
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 990
1017 /* Check that any added subsystems are currently free */ 991 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 992 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 993 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 994 continue;
1023 995
996 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 997 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 998 ret = -EBUSY;
1026 return -EBUSY; 999 goto out_put;
1000 }
1001
1002 /* pin the module */
1003 if (!try_module_get(ss->module)) {
1004 ret = -ENOENT;
1005 goto out_put;
1027 } 1006 }
1007 pinned |= 1 << i;
1028 } 1008 }
1029 1009
1030 /* Currently we don't handle adding/removing subsystems when 1010 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1011 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1012 ret = -ENOENT;
1033 * later */ 1013 goto out_put;
1034 if (root->number_of_cgroups > 1) 1014 }
1035 return -EBUSY; 1015
1016 ret = cgroup_populate_dir(cgrp, added_mask);
1017 if (ret)
1018 goto out_put;
1019
1020 /*
1021 * Nothing can fail from this point on. Remove files for the
1022 * removed subsystems and rebind each subsystem.
1023 */
1024 cgroup_clear_dir(cgrp, removed_mask);
1036 1025
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1026 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1027 unsigned long bit = 1UL << i;
1040 1028
1041 if (bit & added_mask) { 1029 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1030 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1031 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1032 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1033 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1034
1035 rcu_assign_pointer(cgrp->subsys[i],
1036 cgroup_css(cgroup_dummy_top, ss));
1037 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1038
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1039 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1040 ss->root = root;
1051 if (ss->bind) 1041 if (ss->bind)
1052 ss->bind(cgrp); 1042 ss->bind(cgroup_css(cgrp, ss));
1053 1043
1054 /* refcount was already taken, and we're keeping it */ 1044 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1045 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1046 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1047 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1048 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1049 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1050
1061 if (ss->bind) 1051 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1052 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1053
1064 cgrp->subsys[i] = NULL; 1054 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1055 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1056
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1057 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1058 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1059
1068 /* subsystem is now free - drop reference on module */ 1060 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1061 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1062 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1063 }
1086 } 1064 }
1087 1065
@@ -1092,6 +1070,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1070 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1071
1094 return 0; 1072 return 0;
1073
1074out_put:
1075 for_each_subsys(ss, i)
1076 if (pinned & (1 << i))
1077 module_put(ss->module);
1078 return ret;
1095} 1079}
1096 1080
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1081static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1126,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1126 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1127 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1128 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1129 struct cgroup_subsys *ss;
1147 int i; 1130 int i;
1148 1131
@@ -1285,52 +1268,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1268 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1269 return -EINVAL;
1287 1270
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1271 return 0;
1320} 1272}
1321 1273
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1274static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1275{
1336 int ret = 0; 1276 int ret = 0;
@@ -1370,22 +1310,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1310 goto out_unlock;
1371 } 1311 }
1372 1312
1373 /* 1313 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1314 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1315 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1316 goto out_unlock;
1385 } 1317 }
1386 1318
1387 /* re-populate subsystem files */ 1319 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1320 if (ret)
1321 goto out_unlock;
1389 1322
1390 if (opts.release_agent) 1323 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1324 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1328,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1328 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1329 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1330 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1331 return ret;
1401} 1332}
1402 1333
@@ -1416,6 +1347,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1347 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1348 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1349 mutex_init(&cgrp->pidlist_mutex);
1350 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1351 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1352 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1353 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1363,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1363 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1364 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1365 init_cgroup_housekeeping(cgrp);
1366 idr_init(&root->cgroup_idr);
1434} 1367}
1435 1368
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1369static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1436,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1436 */
1504 root->subsys_mask = opts->subsys_mask; 1437 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1438 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1439 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1440 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1441 if (opts->name)
@@ -1519,7 +1451,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarhcy ID shoulid already have been released */ 1451 /* hierarhcy ID shoulid already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1452 WARN_ON_ONCE(root->hierarchy_id);
1521 1453
1522 ida_destroy(&root->cgroup_ida); 1454 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1455 kfree(root);
1524 } 1456 }
1525} 1457}
@@ -1584,7 +1516,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1516 int ret = 0;
1585 struct super_block *sb; 1517 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1518 struct cgroupfs_root *new_root;
1519 struct list_head tmp_links;
1587 struct inode *inode; 1520 struct inode *inode;
1521 const struct cred *cred;
1588 1522
1589 /* First find the desired set of subsystems */ 1523 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1524 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1534,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1534 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1535 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1536 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1537 goto out_err;
1604 } 1538 }
1605 opts.new_root = new_root; 1539 opts.new_root = new_root;
1606 1540
@@ -1609,17 +1543,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1543 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1544 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1545 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1546 goto out_err;
1613 } 1547 }
1614 1548
1615 root = sb->s_fs_info; 1549 root = sb->s_fs_info;
1616 BUG_ON(!root); 1550 BUG_ON(!root);
1617 if (root == opts.new_root) { 1551 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1552 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1553 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1554 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1555 int i;
1624 struct css_set *cset; 1556 struct css_set *cset;
1625 1557
@@ -1634,6 +1566,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1566 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1567 mutex_lock(&cgroup_root_mutex);
1636 1568
1569 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1570 0, 1, GFP_KERNEL);
1571 if (root_cgrp->id < 0)
1572 goto unlock_drop;
1573
1637 /* Check for name clashes with existing mounts */ 1574 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1575 ret = -EBUSY;
1639 if (strlen(root->name)) 1576 if (strlen(root->name))
@@ -1657,26 +1594,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1594 if (ret)
1658 goto unlock_drop; 1595 goto unlock_drop;
1659 1596
1597 sb->s_root->d_fsdata = root_cgrp;
1598 root_cgrp->dentry = sb->s_root;
1599
1600 /*
1601 * We're inside get_sb() and will call lookup_one_len() to
1602 * create the root files, which doesn't work if SELinux is
1603 * in use. The following cred dancing somehow works around
1604 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1605 * populating new cgroupfs mount") for more details.
1606 */
1607 cred = override_creds(&init_cred);
1608
1609 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1610 if (ret)
1611 goto rm_base_files;
1612
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1613 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1614 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1615 goto rm_base_files;
1663 goto unlock_drop; 1616
1664 } 1617 revert_creds(cred);
1618
1665 /* 1619 /*
1666 * There must be no failure case after here, since rebinding 1620 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1621 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1622 * dropped in the failure exit path.
1669 */ 1623 */
1670 1624
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1625 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1626 cgroup_root_count++;
1676 1627
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1628 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1629 * the css_set objects */
1682 write_lock(&css_set_lock); 1630 write_lock(&css_set_lock);
@@ -1689,9 +1637,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1637 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1638 BUG_ON(root->number_of_cgroups != 1);
1691 1639
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1640 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1641 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1642 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1656,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1656 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1657 }
1713 } 1658 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1659 }
1718 1660
1719 kfree(opts.release_agent); 1661 kfree(opts.release_agent);
1720 kfree(opts.name); 1662 kfree(opts.name);
1721 return dget(sb->s_root); 1663 return dget(sb->s_root);
1722 1664
1665 rm_base_files:
1666 free_cgrp_cset_links(&tmp_links);
1667 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1668 revert_creds(cred);
1723 unlock_drop: 1669 unlock_drop:
1724 cgroup_exit_root_id(root); 1670 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1671 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1673 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1674 drop_new_super:
1729 deactivate_locked_super(sb); 1675 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1676 out_err:
1733 kfree(opts.release_agent); 1677 kfree(opts.release_agent);
1734 kfree(opts.name); 1678 kfree(opts.name);
@@ -1746,6 +1690,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1690 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1691 BUG_ON(!list_empty(&cgrp->children));
1748 1692
1693 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1694 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1695 mutex_lock(&cgroup_root_mutex);
1751 1696
@@ -1778,6 +1723,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1723
1779 mutex_unlock(&cgroup_root_mutex); 1724 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1725 mutex_unlock(&cgroup_mutex);
1726 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1727
1782 simple_xattrs_free(&cgrp->xattrs); 1728 simple_xattrs_free(&cgrp->xattrs);
1783 1729
@@ -1889,7 +1835,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
1889struct task_and_cgroup { 1835struct task_and_cgroup {
1890 struct task_struct *task; 1836 struct task_struct *task;
1891 struct cgroup *cgrp; 1837 struct cgroup *cgrp;
1892 struct css_set *cg; 1838 struct css_set *cset;
1893}; 1839};
1894 1840
1895struct cgroup_taskset { 1841struct cgroup_taskset {
@@ -1939,18 +1885,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1939EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1885EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1940 1886
1941/** 1887/**
1942 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1888 * cgroup_taskset_cur_css - return the matching css for the current task
1943 * @tset: taskset of interest 1889 * @tset: taskset of interest
1890 * @subsys_id: the ID of the target subsystem
1944 * 1891 *
1945 * Return the cgroup for the current (last returned) task of @tset. This 1892 * Return the css for the current (last returned) task of @tset for
1946 * function must be preceded by either cgroup_taskset_first() or 1893 * subsystem specified by @subsys_id. This function must be preceded by
1947 * cgroup_taskset_next(). 1894 * either cgroup_taskset_first() or cgroup_taskset_next().
1948 */ 1895 */
1949struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1896struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1897 int subsys_id)
1950{ 1898{
1951 return tset->cur_cgrp; 1899 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1952} 1900}
1953EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1901EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1954 1902
1955/** 1903/**
1956 * cgroup_taskset_size - return the number of tasks in taskset 1904 * cgroup_taskset_size - return the number of tasks in taskset
@@ -2054,7 +2002,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2054 2002
2055 /* @tsk either already exited or can't exit until the end */ 2003 /* @tsk either already exited or can't exit until the end */
2056 if (tsk->flags & PF_EXITING) 2004 if (tsk->flags & PF_EXITING)
2057 continue; 2005 goto next;
2058 2006
2059 /* as per above, nr_threads may decrease, but not increase. */ 2007 /* as per above, nr_threads may decrease, but not increase. */
2060 BUG_ON(i >= group_size); 2008 BUG_ON(i >= group_size);
@@ -2062,7 +2010,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2062 ent.cgrp = task_cgroup_from_root(tsk, root); 2010 ent.cgrp = task_cgroup_from_root(tsk, root);
2063 /* nothing to do if this task is already in the cgroup */ 2011 /* nothing to do if this task is already in the cgroup */
2064 if (ent.cgrp == cgrp) 2012 if (ent.cgrp == cgrp)
2065 continue; 2013 goto next;
2066 /* 2014 /*
2067 * saying GFP_ATOMIC has no effect here because we did prealloc 2015 * saying GFP_ATOMIC has no effect here because we did prealloc
2068 * earlier, but it's good form to communicate our expectations. 2016 * earlier, but it's good form to communicate our expectations.
@@ -2070,7 +2018,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2070 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2018 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2071 BUG_ON(retval != 0); 2019 BUG_ON(retval != 0);
2072 i++; 2020 i++;
2073 2021 next:
2074 if (!threadgroup) 2022 if (!threadgroup)
2075 break; 2023 break;
2076 } while_each_thread(leader, tsk); 2024 } while_each_thread(leader, tsk);
@@ -2089,8 +2037,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2089 * step 1: check that we can legitimately attach to the cgroup. 2037 * step 1: check that we can legitimately attach to the cgroup.
2090 */ 2038 */
2091 for_each_root_subsys(root, ss) { 2039 for_each_root_subsys(root, ss) {
2040 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2041
2092 if (ss->can_attach) { 2042 if (ss->can_attach) {
2093 retval = ss->can_attach(cgrp, &tset); 2043 retval = ss->can_attach(css, &tset);
2094 if (retval) { 2044 if (retval) {
2095 failed_ss = ss; 2045 failed_ss = ss;
2096 goto out_cancel_attach; 2046 goto out_cancel_attach;
@@ -2107,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2107 2057
2108 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2109 old_cset = task_css_set(tc->task); 2059 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp); 2060 tc->cset = find_css_set(old_cset, cgrp);
2111 if (!tc->cg) { 2061 if (!tc->cset) {
2112 retval = -ENOMEM; 2062 retval = -ENOMEM;
2113 goto out_put_css_set_refs; 2063 goto out_put_css_set_refs;
2114 } 2064 }
@@ -2121,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2121 */ 2071 */
2122 for (i = 0; i < group_size; i++) { 2072 for (i = 0; i < group_size; i++) {
2123 tc = flex_array_get(group, i); 2073 tc = flex_array_get(group, i);
2124 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2074 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2125 } 2075 }
2126 /* nothing is sensitive to fork() after this point. */ 2076 /* nothing is sensitive to fork() after this point. */
2127 2077
@@ -2129,8 +2079,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2129 * step 4: do subsystem attach callbacks. 2079 * step 4: do subsystem attach callbacks.
2130 */ 2080 */
2131 for_each_root_subsys(root, ss) { 2081 for_each_root_subsys(root, ss) {
2082 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2083
2132 if (ss->attach) 2084 if (ss->attach)
2133 ss->attach(cgrp, &tset); 2085 ss->attach(css, &tset);
2134 } 2086 }
2135 2087
2136 /* 2088 /*
@@ -2141,18 +2093,20 @@ out_put_css_set_refs:
2141 if (retval) { 2093 if (retval) {
2142 for (i = 0; i < group_size; i++) { 2094 for (i = 0; i < group_size; i++) {
2143 tc = flex_array_get(group, i); 2095 tc = flex_array_get(group, i);
2144 if (!tc->cg) 2096 if (!tc->cset)
2145 break; 2097 break;
2146 put_css_set(tc->cg); 2098 put_css_set(tc->cset);
2147 } 2099 }
2148 } 2100 }
2149out_cancel_attach: 2101out_cancel_attach:
2150 if (retval) { 2102 if (retval) {
2151 for_each_root_subsys(root, ss) { 2103 for_each_root_subsys(root, ss) {
2104 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2105
2152 if (ss == failed_ss) 2106 if (ss == failed_ss)
2153 break; 2107 break;
2154 if (ss->cancel_attach) 2108 if (ss->cancel_attach)
2155 ss->cancel_attach(cgrp, &tset); 2109 ss->cancel_attach(css, &tset);
2156 } 2110 }
2157 } 2111 }
2158out_free_group_list: 2112out_free_group_list:
@@ -2253,9 +2207,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2253 2207
2254 mutex_lock(&cgroup_mutex); 2208 mutex_lock(&cgroup_mutex);
2255 for_each_active_root(root) { 2209 for_each_active_root(root) {
2256 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2210 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2257 2211
2258 retval = cgroup_attach_task(from_cg, tsk, false); 2212 retval = cgroup_attach_task(from_cgrp, tsk, false);
2259 if (retval) 2213 if (retval)
2260 break; 2214 break;
2261 } 2215 }
@@ -2265,34 +2219,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2265} 2219}
2266EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2220EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2267 2221
2268static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2222static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2223 struct cftype *cft, u64 pid)
2269{ 2224{
2270 return attach_task_by_pid(cgrp, pid, false); 2225 return attach_task_by_pid(css->cgroup, pid, false);
2271} 2226}
2272 2227
2273static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2228static int cgroup_procs_write(struct cgroup_subsys_state *css,
2229 struct cftype *cft, u64 tgid)
2274{ 2230{
2275 return attach_task_by_pid(cgrp, tgid, true); 2231 return attach_task_by_pid(css->cgroup, tgid, true);
2276} 2232}
2277 2233
2278static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2234static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2279 const char *buffer) 2235 struct cftype *cft, const char *buffer)
2280{ 2236{
2281 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2237 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2282 if (strlen(buffer) >= PATH_MAX) 2238 if (strlen(buffer) >= PATH_MAX)
2283 return -EINVAL; 2239 return -EINVAL;
2284 if (!cgroup_lock_live_group(cgrp)) 2240 if (!cgroup_lock_live_group(css->cgroup))
2285 return -ENODEV; 2241 return -ENODEV;
2286 mutex_lock(&cgroup_root_mutex); 2242 mutex_lock(&cgroup_root_mutex);
2287 strcpy(cgrp->root->release_agent_path, buffer); 2243 strcpy(css->cgroup->root->release_agent_path, buffer);
2288 mutex_unlock(&cgroup_root_mutex); 2244 mutex_unlock(&cgroup_root_mutex);
2289 mutex_unlock(&cgroup_mutex); 2245 mutex_unlock(&cgroup_mutex);
2290 return 0; 2246 return 0;
2291} 2247}
2292 2248
2293static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2249static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2294 struct seq_file *seq) 2250 struct cftype *cft, struct seq_file *seq)
2295{ 2251{
2252 struct cgroup *cgrp = css->cgroup;
2253
2296 if (!cgroup_lock_live_group(cgrp)) 2254 if (!cgroup_lock_live_group(cgrp))
2297 return -ENODEV; 2255 return -ENODEV;
2298 seq_puts(seq, cgrp->root->release_agent_path); 2256 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2259,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2301 return 0; 2259 return 0;
2302} 2260}
2303 2261
2304static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2262static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2305 struct seq_file *seq) 2263 struct cftype *cft, struct seq_file *seq)
2306{ 2264{
2307 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2265 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2308 return 0; 2266 return 0;
2309} 2267}
2310 2268
2311/* A buffer size big enough for numbers or short strings */ 2269/* A buffer size big enough for numbers or short strings */
2312#define CGROUP_LOCAL_BUFFER_SIZE 64 2270#define CGROUP_LOCAL_BUFFER_SIZE 64
2313 2271
2314static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2272static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2315 struct file *file, 2273 struct cftype *cft, struct file *file,
2316 const char __user *userbuf, 2274 const char __user *userbuf, size_t nbytes,
2317 size_t nbytes, loff_t *unused_ppos) 2275 loff_t *unused_ppos)
2318{ 2276{
2319 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2277 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2320 int retval = 0; 2278 int retval = 0;
@@ -2332,22 +2290,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2332 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2290 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2333 if (*end) 2291 if (*end)
2334 return -EINVAL; 2292 return -EINVAL;
2335 retval = cft->write_u64(cgrp, cft, val); 2293 retval = cft->write_u64(css, cft, val);
2336 } else { 2294 } else {
2337 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2295 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2338 if (*end) 2296 if (*end)
2339 return -EINVAL; 2297 return -EINVAL;
2340 retval = cft->write_s64(cgrp, cft, val); 2298 retval = cft->write_s64(css, cft, val);
2341 } 2299 }
2342 if (!retval) 2300 if (!retval)
2343 retval = nbytes; 2301 retval = nbytes;
2344 return retval; 2302 return retval;
2345} 2303}
2346 2304
2347static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2305static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2348 struct file *file, 2306 struct cftype *cft, struct file *file,
2349 const char __user *userbuf, 2307 const char __user *userbuf, size_t nbytes,
2350 size_t nbytes, loff_t *unused_ppos) 2308 loff_t *unused_ppos)
2351{ 2309{
2352 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2310 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2353 int retval = 0; 2311 int retval = 0;
@@ -2370,7 +2328,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2370 } 2328 }
2371 2329
2372 buffer[nbytes] = 0; /* nul-terminate */ 2330 buffer[nbytes] = 0; /* nul-terminate */
2373 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2331 retval = cft->write_string(css, cft, strstrip(buffer));
2374 if (!retval) 2332 if (!retval)
2375 retval = nbytes; 2333 retval = nbytes;
2376out: 2334out:
@@ -2380,65 +2338,60 @@ out:
2380} 2338}
2381 2339
2382static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2340static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2383 size_t nbytes, loff_t *ppos) 2341 size_t nbytes, loff_t *ppos)
2384{ 2342{
2343 struct cfent *cfe = __d_cfe(file->f_dentry);
2385 struct cftype *cft = __d_cft(file->f_dentry); 2344 struct cftype *cft = __d_cft(file->f_dentry);
2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2345 struct cgroup_subsys_state *css = cfe->css;
2387 2346
2388 if (cgroup_is_dead(cgrp))
2389 return -ENODEV;
2390 if (cft->write) 2347 if (cft->write)
2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2348 return cft->write(css, cft, file, buf, nbytes, ppos);
2392 if (cft->write_u64 || cft->write_s64) 2349 if (cft->write_u64 || cft->write_s64)
2393 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2350 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2394 if (cft->write_string) 2351 if (cft->write_string)
2395 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2352 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2396 if (cft->trigger) { 2353 if (cft->trigger) {
2397 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2354 int ret = cft->trigger(css, (unsigned int)cft->private);
2398 return ret ? ret : nbytes; 2355 return ret ? ret : nbytes;
2399 } 2356 }
2400 return -EINVAL; 2357 return -EINVAL;
2401} 2358}
2402 2359
2403static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2360static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2404 struct file *file, 2361 struct cftype *cft, struct file *file,
2405 char __user *buf, size_t nbytes, 2362 char __user *buf, size_t nbytes, loff_t *ppos)
2406 loff_t *ppos)
2407{ 2363{
2408 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2364 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2409 u64 val = cft->read_u64(cgrp, cft); 2365 u64 val = cft->read_u64(css, cft);
2410 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2366 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2411 2367
2412 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2368 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2413} 2369}
2414 2370
2415static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2371static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2416 struct file *file, 2372 struct cftype *cft, struct file *file,
2417 char __user *buf, size_t nbytes, 2373 char __user *buf, size_t nbytes, loff_t *ppos)
2418 loff_t *ppos)
2419{ 2374{
2420 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2375 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2421 s64 val = cft->read_s64(cgrp, cft); 2376 s64 val = cft->read_s64(css, cft);
2422 int len = sprintf(tmp, "%lld\n", (long long) val); 2377 int len = sprintf(tmp, "%lld\n", (long long) val);
2423 2378
2424 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2379 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2425} 2380}
2426 2381
2427static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2382static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2428 size_t nbytes, loff_t *ppos) 2383 size_t nbytes, loff_t *ppos)
2429{ 2384{
2385 struct cfent *cfe = __d_cfe(file->f_dentry);
2430 struct cftype *cft = __d_cft(file->f_dentry); 2386 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2387 struct cgroup_subsys_state *css = cfe->css;
2432
2433 if (cgroup_is_dead(cgrp))
2434 return -ENODEV;
2435 2388
2436 if (cft->read) 2389 if (cft->read)
2437 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2390 return cft->read(css, cft, file, buf, nbytes, ppos);
2438 if (cft->read_u64) 2391 if (cft->read_u64)
2439 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2392 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2440 if (cft->read_s64) 2393 if (cft->read_s64)
2441 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2394 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2442 return -EINVAL; 2395 return -EINVAL;
2443} 2396}
2444 2397
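The dispatchers above now hand every handler a struct cgroup_subsys_state instead of a struct cgroup; a handler migrates by taking @css and reaching the cgroup through css->cgroup only where it is still needed. A minimal sketch of a read_u64/write_u64 pair under the new prototypes (the names and the stored value are invented for illustration):

static u64 demo_weight_read(struct cgroup_subsys_state *css,
			    struct cftype *cft)
{
	/* real code would derive per-controller state from @css */
	return 100;
}

static int demo_weight_write(struct cgroup_subsys_state *css,
			     struct cftype *cft, u64 val)
{
	if (!val || val > 1000)
		return -EINVAL;
	/* store @val in the state derived from @css */
	return 0;
}
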
@@ -2447,11 +2400,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2447 * supports string->u64 maps, but can be extended in future. 2400 * supports string->u64 maps, but can be extended in future.
2448 */ 2401 */
2449 2402
2450struct cgroup_seqfile_state {
2451 struct cftype *cft;
2452 struct cgroup *cgroup;
2453};
2454
2455static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2403static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2456{ 2404{
2457 struct seq_file *sf = cb->state; 2405 struct seq_file *sf = cb->state;
@@ -2460,69 +2408,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2408
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2409static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2410{
2463 struct cgroup_seqfile_state *state = m->private; 2411 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2412 struct cftype *cft = cfe->type;
2413 struct cgroup_subsys_state *css = cfe->css;
2414
2465 if (cft->read_map) { 2415 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2416 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2417 .fill = cgroup_map_add,
2468 .state = m, 2418 .state = m,
2469 }; 2419 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2420 return cft->read_map(css, cft, &cb);
2471 } 2421 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2422 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2423}
2481 2424
2482static const struct file_operations cgroup_seqfile_operations = { 2425static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2426 .read = seq_read,
2484 .write = cgroup_file_write, 2427 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2428 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2429 .release = single_release,
2487}; 2430};
2488 2431
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2432static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2433{
2434 struct cfent *cfe = __d_cfe(file->f_dentry);
2435 struct cftype *cft = __d_cft(file->f_dentry);
2436 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2437 struct cgroup_subsys_state *css;
2491 int err; 2438 int err;
2492 struct cftype *cft;
2493 2439
2494 err = generic_file_open(inode, file); 2440 err = generic_file_open(inode, file);
2495 if (err) 2441 if (err)
2496 return err; 2442 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2443
2499 if (cft->read_map || cft->read_seq_string) { 2444 /*
2500 struct cgroup_seqfile_state *state; 2445 * If the file belongs to a subsystem, pin the css. Will be
2446 * unpinned either on open failure or release. This ensures that
2447 * @css stays alive for all file operations.
2448 */
2449 rcu_read_lock();
2450 css = cgroup_css(cgrp, cft->ss);
2451 if (cft->ss && !css_tryget(css))
2452 css = NULL;
2453 rcu_read_unlock();
2501 2454
2502 state = kzalloc(sizeof(*state), GFP_USER); 2455 if (!css)
2503 if (!state) 2456 return -ENODEV;
2504 return -ENOMEM;
2505 2457
2506 state->cft = cft; 2458 /*
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2459 * @cfe->css is used by read/write/close to determine the
2460 * associated css. @file->private_data would be a better place but
2461 * that's already used by seqfile. Multiple accessors may use it
2462 * simultaneously which is okay as the association never changes.
2463 */
2464 WARN_ON_ONCE(cfe->css && cfe->css != css);
2465 cfe->css = css;
2466
2467 if (cft->read_map || cft->read_seq_string) {
2508 file->f_op = &cgroup_seqfile_operations; 2468 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2469 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2470 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2471 err = cft->open(inode, file);
2514 else 2472 }
2515 err = 0;
2516 2473
2474 if (css->ss && err)
2475 css_put(css);
2517 return err; 2476 return err;
2518} 2477}
2519 2478
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2479static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2480{
2481 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2482 struct cftype *cft = __d_cft(file->f_dentry);
2483 struct cgroup_subsys_state *css = cfe->css;
2484 int ret = 0;
2485
2523 if (cft->release) 2486 if (cft->release)
2524 return cft->release(inode, file); 2487 ret = cft->release(inode, file);
2525 return 0; 2488 if (css->ss)
2489 css_put(css);
2490 return ret;
2526} 2491}
2527 2492
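cgroup_file_open() and cgroup_file_release() now bracket every file operation with a css reference instead of leaning on the dentry. The pin-under-RCU idiom they rely on is shown standalone below as a sketch; the wrapper name is invented, css_tryget()/css_put() are the real primitives:

/* Returns a pinned css or NULL; a non-NULL result is paired with css_put(). */
static struct cgroup_subsys_state *
demo_pin_css(struct cgroup_subsys_state *css)
{
	rcu_read_lock();
	if (css && !css_tryget(css))
		css = NULL;	/* the css is already being killed */
	rcu_read_unlock();
	return css;
}
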
2528/* 2493/*
@@ -2736,8 +2701,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2701 return mode;
2737} 2702}
2738 2703
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2704static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2705{
2742 struct dentry *dir = cgrp->dentry; 2706 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2707 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2711,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2711 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2712 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2713
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2714 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2715 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2716 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2717 strcat(name, ".");
2753 } 2718 }
2754 strcat(name, cft->name); 2719 strcat(name, cft->name);
@@ -2782,11 +2747,25 @@ out:
2782 return error; 2747 return error;
2783} 2748}
2784 2749
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2750/**
2786 struct cftype cfts[], bool is_add) 2751 * cgroup_addrm_files - add or remove files to a cgroup directory
2752 * @cgrp: the target cgroup
2753 * @cfts: array of cftypes to be added
2754 * @is_add: whether to add or remove
2755 *
2756 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2757 * For removals, this function never fails. If addition fails, this
2758 * function doesn't remove files already added. The caller is responsible
2759 * for cleaning up.
2760 */
2761static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2762 bool is_add)
2787{ 2763{
2788 struct cftype *cft; 2764 struct cftype *cft;
2789 int err, ret = 0; 2765 int ret;
2766
2767 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2768 lockdep_assert_held(&cgroup_mutex);
2790 2769
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2770 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2771 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2777,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2777 continue;
2799 2778
2800 if (is_add) { 2779 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2780 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2781 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2782 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2783 cft->name, ret);
2805 ret = err; 2784 return ret;
2785 }
2806 } else { 2786 } else {
2807 cgroup_rm_file(cgrp, cft); 2787 cgroup_rm_file(cgrp, cft);
2808 } 2788 }
2809 } 2789 }
2810 return ret; 2790 return 0;
2811} 2791}
2812 2792
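Because addition stops at the first failure without rolling back, and removal never fails, a caller wanting all-or-nothing behaviour simply removes the whole array on error. A sketch of that pattern, assuming the caller already holds the i_mutex and cgroup_mutex asserted above:

	ret = cgroup_addrm_files(cgrp, cfts, true);
	if (ret)
		/* partial additions are not rolled back automatically */
		cgroup_addrm_files(cgrp, cfts, false);
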
2813static void cgroup_cfts_prepare(void) 2793static void cgroup_cfts_prepare(void)
@@ -2816,28 +2796,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2796 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2797 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2798 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2799 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2800 * lock before calling cgroup_addrm_files().
2821 */ 2801 */
2822 mutex_lock(&cgroup_mutex); 2802 mutex_lock(&cgroup_mutex);
2823} 2803}
2824 2804
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2805static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2806 __releases(&cgroup_mutex)
2828{ 2807{
2829 LIST_HEAD(pending); 2808 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2809 struct cgroup_subsys *ss = cfts[0].ss;
2810 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2811 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2812 struct dentry *prev = NULL;
2833 struct inode *inode; 2813 struct inode *inode;
2814 struct cgroup_subsys_state *css;
2834 u64 update_before; 2815 u64 update_before;
2816 int ret = 0;
2835 2817
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2818 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2819 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2820 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2821 mutex_unlock(&cgroup_mutex);
2840 return; 2822 return 0;
2841 } 2823 }
2842 2824
2843 /* 2825 /*
@@ -2849,17 +2831,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2831
2850 mutex_unlock(&cgroup_mutex); 2832 mutex_unlock(&cgroup_mutex);
2851 2833
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2834 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2835 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2836 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2837 struct cgroup *cgrp = css->cgroup;
2838
2863 if (cgroup_is_dead(cgrp)) 2839 if (cgroup_is_dead(cgrp))
2864 continue; 2840 continue;
2865 2841
@@ -2873,15 +2849,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2849 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2850 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2851 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2852 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2853 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2854 mutex_unlock(&inode->i_mutex);
2879 2855
2880 rcu_read_lock(); 2856 rcu_read_lock();
2857 if (ret)
2858 break;
2881 } 2859 }
2882 rcu_read_unlock(); 2860 rcu_read_unlock();
2883 dput(prev); 2861 dput(prev);
2884 deactivate_super(sb); 2862 deactivate_super(sb);
2863 return ret;
2885} 2864}
2886 2865
2887/** 2866/**
@@ -2901,49 +2880,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2880int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2881{
2903 struct cftype_set *set; 2882 struct cftype_set *set;
2883 struct cftype *cft;
2884 int ret;
2904 2885
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2886 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2887 if (!set)
2907 return -ENOMEM; 2888 return -ENOMEM;
2908 2889
2890 for (cft = cfts; cft->name[0] != '\0'; cft++)
2891 cft->ss = ss;
2892
2909 cgroup_cfts_prepare(); 2893 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2894 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2895 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2896 ret = cgroup_cfts_commit(cfts, true);
2913 2897 if (ret)
2914 return 0; 2898 cgroup_rm_cftypes(cfts);
2899 return ret;
2915} 2900}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2901EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2902
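With ->ss recorded in every cftype at registration time, a controller only describes its files and hands the array over. A hedged sketch of a minimal registration, reusing the hypothetical handlers from the earlier sketch (demo_subsys stands in for a real struct cgroup_subsys):

static struct cftype demo_files[] = {
	{
		.name = "weight",
		.read_u64 = demo_weight_read,
		.write_u64 = demo_weight_write,
	},
	{ }	/* zero-length name terminates the array */
};

static int __init demo_cgroup_init(void)
{
	return cgroup_add_cftypes(&demo_subsys, demo_files);
}

The matching teardown is simply cgroup_rm_cftypes(demo_files); the subsystem argument is gone because cfts[0].ss now identifies it.
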
2918/** 2903/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2904 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2905 * @cfts: zero-length name terminated array of cftypes
2922 * 2906 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2907 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2908 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2909 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2910 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2911 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2912 * registered.
2930 */ 2913 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2914int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2915{
2933 struct cftype_set *set; 2916 struct cftype_set *set;
2934 2917
2918 if (!cfts || !cfts[0].ss)
2919 return -ENOENT;
2920
2935 cgroup_cfts_prepare(); 2921 cgroup_cfts_prepare();
2936 2922
2937 list_for_each_entry(set, &ss->cftsets, node) { 2923 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2924 if (set->cfts == cfts) {
2939 list_del(&set->node); 2925 list_del(&set->node);
2940 kfree(set); 2926 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2927 cgroup_cfts_commit(cfts, false);
2942 return 0; 2928 return 0;
2943 } 2929 }
2944 } 2930 }
2945 2931
2946 cgroup_cfts_commit(ss, NULL, false); 2932 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2933 return -ENOENT;
2948} 2934}
2949 2935
@@ -2966,34 +2952,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2952}
2967 2953
2968/* 2954/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2955 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2956 * their cgroups capability, we don't maintain the lists running through
2971 */ 2957 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2958 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2959 */
2998static void cgroup_enable_task_cg_lists(void) 2960static void cgroup_enable_task_cg_lists(void)
2999{ 2961{
@@ -3024,16 +2986,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 2986}
3025 2987
3026/** 2988/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 2989 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 2990 * @pos_css: the current position (%NULL to initiate traversal)
2991 * @parent_css: css whose children to walk
3029 * 2992 *
3030 * This function returns the next sibling of @pos and should be called 2993 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 2994 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 2995 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 2996 * regardless of their states.
3034 */ 2997 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 2998struct cgroup_subsys_state *
2999css_next_child(struct cgroup_subsys_state *pos_css,
3000 struct cgroup_subsys_state *parent_css)
3036{ 3001{
3002 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3003 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3004 struct cgroup *next;
3038 3005
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3006 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3015,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3015 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3016 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3017 * to be visible as %true here.
3018 *
3019 * If @pos is dead, its next pointer can't be dereferenced;
3020 * however, as each cgroup is given a monotonically increasing
3021 * unique serial number and always appended to the sibling list,
3022 * the next one can be found by walking the parent's children until
3023 * we see a cgroup with higher serial number than @pos's. While
3024 * this path can be slower, it's taken only when either the current
3025 * cgroup is removed or iteration and removal race.
3051 */ 3026 */
3052 if (likely(!cgroup_is_dead(pos))) { 3027 if (!pos) {
3028 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3029 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3030 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3031 } else {
3055 return next; 3032 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3033 if (next->serial_nr > pos->serial_nr)
3034 break;
3057 } 3035 }
3058 3036
3059 /* 3037 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3038 return NULL;
3061 * monotonically increasing unique serial number and always 3039
3062 * appended to the sibling list, so the next one can be found by 3040 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3041}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3042EXPORT_SYMBOL_GPL(css_next_child);
3075 3043
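A child walk is done under RCU and, thanks to the serial-number fallback, survives removal of the current position. A sketch using the css_for_each_child() wrapper (the loop body is illustrative only):

	struct cgroup_subsys_state *child;
	int nr_children = 0;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		nr_children++;		/* e.g. count the live children of @parent_css */
	rcu_read_unlock();
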
3076/** 3044/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3045 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3046 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3047 * @root: css whose descendants to walk
3080 * 3048 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3049 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3050 * to visit for pre-order traversal of @root's descendants. @root is
3051 * included in the iteration and the first node to be visited.
3083 * 3052 *
3084 * While this function requires RCU read locking, it doesn't require the 3053 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3054 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3055 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3056 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3057 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3058struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3059css_next_descendant_pre(struct cgroup_subsys_state *pos,
3060 struct cgroup_subsys_state *root)
3091{ 3061{
3092 struct cgroup *next; 3062 struct cgroup_subsys_state *next;
3093 3063
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3064 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3065
3096 /* if first iteration, pretend we just visited @cgroup */ 3066 /* if first iteration, visit @root */
3097 if (!pos) 3067 if (!pos)
3098 pos = cgroup; 3068 return root;
3099 3069
3100 /* visit the first child if exists */ 3070 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3071 next = css_next_child(NULL, pos);
3102 if (next) 3072 if (next)
3103 return next; 3073 return next;
3104 3074
3105 /* no child, visit my or the closest ancestor's next sibling */ 3075 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3076 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3077 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3078 if (next)
3109 return next; 3079 return next;
3110 pos = pos->parent; 3080 pos = css_parent(pos);
3111 } 3081 }
3112 3082
3113 return NULL; 3083 return NULL;
3114} 3084}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3085EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3116 3086
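One behavioural change to note: unlike the old cgroup iterator, @root itself is now the first node visited. A sketch of a pre-order walk that still wants descendants only:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		if (pos == root_css)
			continue;	/* skip @root_css if only true descendants matter */
		/* visit @pos */
	}
	rcu_read_unlock();
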
3117/** 3087/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3088 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3089 * @pos: css of interest
3120 * 3090 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3091 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3092 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3093 * subtree of @pos.
3124 * 3094 *
3125 * While this function requires RCU read locking, it doesn't require the 3095 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3097,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3097 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3098 * accessible.
3129 */ 3099 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3100struct cgroup_subsys_state *
3101css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3102{
3132 struct cgroup *last, *tmp; 3103 struct cgroup_subsys_state *last, *tmp;
3133 3104
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3105 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3106
@@ -3137,82 +3108,136 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3108 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3109 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3110 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3111 css_for_each_child(tmp, last)
3141 pos = tmp; 3112 pos = tmp;
3142 } while (pos); 3113 } while (pos);
3143 3114
3144 return last; 3115 return last;
3145} 3116}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3117EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3147 3118
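During a pre-order walk the usual way to prune a subtree is to jump to its rightmost descendant before asking for the next node; the iterator then resumes at the next sibling. A sketch (the pruning predicate is invented):

	struct cgroup_subsys_state *pos = NULL;

	rcu_read_lock();
	while ((pos = css_next_descendant_pre(pos, root_css))) {
		if (!demo_subtree_wanted(pos)) {
			/* skip @pos and everything below it */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		/* visit @pos */
	}
	rcu_read_unlock();
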
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3119static struct cgroup_subsys_state *
3120css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3121{
3150 struct cgroup *last; 3122 struct cgroup_subsys_state *last;
3151 3123
3152 do { 3124 do {
3153 last = pos; 3125 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3126 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3127 } while (pos);
3157 3128
3158 return last; 3129 return last;
3159} 3130}
3160 3131
3161/** 3132/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3133 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3134 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3135 * @root: css whose descendants to walk
3165 * 3136 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3137 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3138 * to visit for post-order traversal of @root's descendants. @root is
3139 * included in the iteration and the last node to be visited.
3168 * 3140 *
3169 * While this function requires RCU read locking, it doesn't require the 3141 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3142 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3143 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3144 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3145 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3146struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3147css_next_descendant_post(struct cgroup_subsys_state *pos,
3148 struct cgroup_subsys_state *root)
3176{ 3149{
3177 struct cgroup *next; 3150 struct cgroup_subsys_state *next;
3178 3151
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3152 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3153
3181 /* if first iteration, visit the leftmost descendant */ 3154 /* if first iteration, visit leftmost descendant which may be @root */
3182 if (!pos) { 3155 if (!pos)
3183 next = cgroup_leftmost_descendant(cgroup); 3156 return css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3157
3185 } 3158 /* if we visited @root, we're done */
3159 if (pos == root)
3160 return NULL;
3186 3161
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3162 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3163 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3164 if (next)
3190 return cgroup_leftmost_descendant(next); 3165 return css_leftmost_descendant(next);
3191 3166
3192 /* no sibling left, visit parent */ 3167 /* no sibling left, visit parent */
3193 next = pos->parent; 3168 return css_parent(pos);
3194 return next != cgroup ? next : NULL; 3169}
3170EXPORT_SYMBOL_GPL(css_next_descendant_post);
3171
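Post-order is the natural shape for teardown-style work where children must be handled before their parents; @root is again part of the walk and comes last. A sketch:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root_css) {
		/* every child of @pos has already been visited; @root_css comes last */
	}
	rcu_read_unlock();
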
3172/**
3173 * css_advance_task_iter - advance a task iterator to the next css_set
3174 * @it: the iterator to advance
3175 *
3176 * Advance @it to the next css_set to walk.
3177 */
3178static void css_advance_task_iter(struct css_task_iter *it)
3179{
3180 struct list_head *l = it->cset_link;
3181 struct cgrp_cset_link *link;
3182 struct css_set *cset;
3183
3184 /* Advance to the next non-empty css_set */
3185 do {
3186 l = l->next;
3187 if (l == &it->origin_css->cgroup->cset_links) {
3188 it->cset_link = NULL;
3189 return;
3190 }
3191 link = list_entry(l, struct cgrp_cset_link, cset_link);
3192 cset = link->cset;
3193 } while (list_empty(&cset->tasks));
3194 it->cset_link = l;
3195 it->task = cset->tasks.next;
3195} 3196}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3197 3197
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3198/**
3199 * css_task_iter_start - initiate task iteration
3200 * @css: the css to walk tasks of
3201 * @it: the task iterator to use
3202 *
3203 * Initiate iteration through the tasks of @css. The caller can call
3204 * css_task_iter_next() to walk through the tasks until the function
3205 * returns NULL. On completion of iteration, css_task_iter_end() must be
3206 * called.
3207 *
3208 * Note that this function acquires a lock which is released when the
3209 * iteration finishes. The caller can't sleep while iteration is in
3210 * progress.
3211 */
3212void css_task_iter_start(struct cgroup_subsys_state *css,
3213 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3214 __acquires(css_set_lock)
3200{ 3215{
3201 /* 3216 /*
3202 * The first time anyone tries to iterate across a cgroup, 3217 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3218 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3219 * all existing tasks.
3205 */ 3220 */
3206 if (!use_task_css_set_links) 3221 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3222 cgroup_enable_task_cg_lists();
3208 3223
3209 read_lock(&css_set_lock); 3224 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3225
3211 cgroup_advance_iter(cgrp, it); 3226 it->origin_css = css;
3227 it->cset_link = &css->cgroup->cset_links;
3228
3229 css_advance_task_iter(it);
3212} 3230}
3213 3231
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3232/**
3215 struct cgroup_iter *it) 3233 * css_task_iter_next - return the next task for the iterator
3234 * @it: the task iterator being iterated
3235 *
3236 * The "next" function for task iteration. @it should have been
3237 * initialized via css_task_iter_start(). Returns NULL when the iteration
3238 * reaches the end.
3239 */
3240struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3241{
3217 struct task_struct *res; 3242 struct task_struct *res;
3218 struct list_head *l = it->task; 3243 struct list_head *l = it->task;
@@ -3226,16 +3251,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3251 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3252 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3253 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3254 /*
3230 * the next cg_cgroup_link */ 3255 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3256 * next cgrp_cset_link.
3257 */
3258 css_advance_task_iter(it);
3232 } else { 3259 } else {
3233 it->task = l; 3260 it->task = l;
3234 } 3261 }
3235 return res; 3262 return res;
3236} 3263}
3237 3264
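Putting the three calls together, walking every task attached to a css is a plain start/next/end loop; css_set_lock stays read-held for the duration, so the body must not sleep. A sketch:

	struct css_task_iter it;
	struct task_struct *task;
	int nr_tasks = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr_tasks++;		/* must not sleep inside this loop */
	css_task_iter_end(&it);
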
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3265/**
3266 * css_task_iter_end - finish task iteration
3267 * @it: the task iterator to finish
3268 *
3269 * Finish task iteration started by css_task_iter_start().
3270 */
3271void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3272 __releases(css_set_lock)
3240{ 3273{
3241 read_unlock(&css_set_lock); 3274 read_unlock(&css_set_lock);
@@ -3276,46 +3309,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3309}
3277 3310
3278/** 3311/**
3279 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 3312 * css_scan_tasks - iterate through all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3313 * @css: the css to iterate tasks of
3314 * @test: optional test callback
3315 * @process: process callback
3316 * @data: data passed to @test and @process
3317 * @heap: optional pre-allocated heap used for task iteration
3318 *
3319 * Iterate through all the tasks in @css, calling @test for each, and if it
3320 * returns %true, call @process for it also.
3281 * 3321 *
3282 * Arguments include pointers to callback functions test_task() and 3322 * @test may be NULL, meaning always true (select all tasks), which
3283 * process_task(). 3323 * effectively duplicates css_task_iter_{start,next,end}() but does not
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3324 * lock css_set_lock for the call to @process.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3325 *
3297 * Note that test_task() may be called with locks held, and may in some 3326 * It is guaranteed that @process will act on every task that is a member
3298 * situations be called multiple times for the same task, so it should 3327 * of @css for the duration of this call. This function may or may not
3299 * be cheap. 3328 * call @process for tasks that exit or move to a different css during the
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3329 * call, or are forked or move into the css during the call.
3301 * pre-allocated and will be used for heap operations (and its "gt" member will 3330 *
3302 * be overwritten), else a temporary heap will be used (allocation of which 3331 * Note that @test may be called with locks held, and may in some
3303 * may cause this function to fail). 3332 * situations be called multiple times for the same task, so it should be
3333 * cheap.
3334 *
3335 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3336 * heap operations (and its "gt" member will be overwritten), else a
3337 * temporary heap will be used (allocation of which may cause this function
3338 * to fail).
3304 */ 3339 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3340int css_scan_tasks(struct cgroup_subsys_state *css,
3341 bool (*test)(struct task_struct *, void *),
3342 void (*process)(struct task_struct *, void *),
3343 void *data, struct ptr_heap *heap)
3306{ 3344{
3307 int retval, i; 3345 int retval, i;
3308 struct cgroup_iter it; 3346 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3347 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3348 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3349 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3350 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3351 struct timespec latest_time = { 0, 0 };
3315 3352
3316 if (scan->heap) { 3353 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3354 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3355 heap->gt = &started_after;
3320 } else { 3356 } else {
3321 /* We need to allocate our own heap memory */ 3357 /* We need to allocate our own heap memory */
@@ -3328,25 +3364,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3364
3329 again: 3365 again:
3330 /* 3366 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3367 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3368 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3369 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3370 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3371 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3372 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3373 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3374 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3375 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3376 */
3342 heap->size = 0; 3377 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3378 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3379 while ((p = css_task_iter_next(&it))) {
3345 /* 3380 /*
3346 * Only affect tasks that qualify per the caller's callback, 3381 * Only affect tasks that qualify per the caller's callback,
3347 * if one was provided 3382 * if one was provided
3348 */ 3383 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3384 if (test && !test(p, data))
3350 continue; 3385 continue;
3351 /* 3386 /*
3352 * Only process tasks that started after the last task 3387 * Only process tasks that started after the last task
@@ -3374,7 +3409,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3409 * the heap and wasn't inserted
3375 */ 3410 */
3376 } 3411 }
3377 cgroup_iter_end(scan->cg, &it); 3412 css_task_iter_end(&it);
3378 3413
3379 if (heap->size) { 3414 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3415 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3419,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3419 latest_task = q;
3385 } 3420 }
3386 /* Process the task per the caller's callback */ 3421 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3422 process(q, data);
3388 put_task_struct(q); 3423 put_task_struct(q);
3389 } 3424 }
3390 /* 3425 /*
@@ -3401,10 +3436,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3436 return 0;
3402} 3437}
3403 3438
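cgroup_transfer_tasks() below is the simplest caller, passing a NULL @test to select every task. When filtering is wanted, the callback pair looks like this sketch (predicate and action are invented; @process runs without css_set_lock held):

static bool demo_is_userspace(struct task_struct *task, void *data)
{
	return !(task->flags & PF_KTHREAD);	/* skip kernel threads */
}

static void demo_count_task(struct task_struct *task, void *data)
{
	atomic_inc((atomic_t *)data);
}

	/* ... */
	atomic_t nr = ATOMIC_INIT(0);

	css_scan_tasks(css, demo_is_userspace, demo_count_task, &nr, NULL);
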
3404static void cgroup_transfer_one_task(struct task_struct *task, 3439static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3440{
3407 struct cgroup *new_cgroup = scan->data; 3441 struct cgroup *new_cgroup = data;
3408 3442
3409 mutex_lock(&cgroup_mutex); 3443 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3444 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3452,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3452 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3453int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3454{
3421 struct cgroup_scanner scan; 3455 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3456 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3457}
3431 3458
3432/* 3459/*
@@ -3468,7 +3495,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3495 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3496 struct cgroup *owner;
3470 /* protects the other fields */ 3497 /* protects the other fields */
3471 struct rw_semaphore mutex; 3498 struct rw_semaphore rwsem;
3472}; 3499};
3473 3500
3474/* 3501/*
@@ -3541,7 +3568,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3568 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3569
3543 /* 3570 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3571 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3572 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3573 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3574 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3577,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3577 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3578 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3579 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3580 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3581 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3582 return l;
3556 } 3583 }
@@ -3561,8 +3588,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3588 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3589 return l;
3563 } 3590 }
3564 init_rwsem(&l->mutex); 3591 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3592 down_write(&l->rwsem);
3566 l->key.type = type; 3593 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3594 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3595 l->owner = cgrp;
@@ -3580,7 +3607,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3607 pid_t *array;
3581 int length; 3608 int length;
3582 int pid, n = 0; /* used for populating the array */ 3609 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3610 struct css_task_iter it;
3584 struct task_struct *tsk; 3611 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3612 struct cgroup_pidlist *l;
3586 3613
@@ -3595,8 +3622,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3622 if (!array)
3596 return -ENOMEM; 3623 return -ENOMEM;
3597 /* now, populate the array */ 3624 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3625 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3626 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3627 if (unlikely(n == length))
3601 break; 3628 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3629 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3634,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3634 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3635 array[n++] = pid;
3609 } 3636 }
3610 cgroup_iter_end(cgrp, &it); 3637 css_task_iter_end(&it);
3611 length = n; 3638 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3639 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3640 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3650,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3650 l->list = array;
3624 l->length = length; 3651 l->length = length;
3625 l->use_count++; 3652 l->use_count++;
3626 up_write(&l->mutex); 3653 up_write(&l->rwsem);
3627 *lp = l; 3654 *lp = l;
3628 return 0; 3655 return 0;
3629} 3656}
@@ -3641,7 +3668,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3668{
3642 int ret = -EINVAL; 3669 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3670 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3671 struct css_task_iter it;
3645 struct task_struct *tsk; 3672 struct task_struct *tsk;
3646 3673
3647 /* 3674 /*
@@ -3655,8 +3682,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3682 ret = 0;
3656 cgrp = dentry->d_fsdata; 3683 cgrp = dentry->d_fsdata;
3657 3684
3658 cgroup_iter_start(cgrp, &it); 3685 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3686 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3687 switch (tsk->state) {
3661 case TASK_RUNNING: 3688 case TASK_RUNNING:
3662 stats->nr_running++; 3689 stats->nr_running++;
@@ -3676,7 +3703,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3703 break;
3677 } 3704 }
3678 } 3705 }
3679 cgroup_iter_end(cgrp, &it); 3706 css_task_iter_end(&it);
3680 3707
3681err: 3708err:
3682 return ret; 3709 return ret;
@@ -3701,7 +3728,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3728 int index = 0, pid = *pos;
3702 int *iter; 3729 int *iter;
3703 3730
3704 down_read(&l->mutex); 3731 down_read(&l->rwsem);
3705 if (pid) { 3732 if (pid) {
3706 int end = l->length; 3733 int end = l->length;
3707 3734
@@ -3728,7 +3755,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3755static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3756{
3730 struct cgroup_pidlist *l = s->private; 3757 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3758 up_read(&l->rwsem);
3732} 3759}
3733 3760
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3761static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3801,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3801 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3802 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3803 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3804 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3805 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3806 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3807 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3809,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3809 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3810 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3811 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3812 up_write(&l->rwsem);
3786 kfree(l); 3813 kfree(l);
3787 return; 3814 return;
3788 } 3815 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3816 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3817 up_write(&l->rwsem);
3791} 3818}
3792 3819
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3820static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3878,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3878 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3879}
3853 3880
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3881static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3882 struct cftype *cft)
3856{ 3883{
3857 return notify_on_release(cgrp); 3884 return notify_on_release(css->cgroup);
3858} 3885}
3859 3886
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3887static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3888 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3889{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3890 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3891 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3892 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3893 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3894 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3895 return 0;
3870} 3896}
3871 3897
@@ -3895,18 +3921,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3921{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3922 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3923 remove);
3898 struct cgroup *cgrp = event->cgrp; 3924 struct cgroup_subsys_state *css = event->css;
3899 3925
3900 remove_wait_queue(event->wqh, &event->wait); 3926 remove_wait_queue(event->wqh, &event->wait);
3901 3927
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3928 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3929
3904 /* Notify userspace the event is going away. */ 3930 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3931 eventfd_signal(event->eventfd, 1);
3906 3932
3907 eventfd_ctx_put(event->eventfd); 3933 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3934 kfree(event);
3909 cgroup_dput(cgrp); 3935 css_put(css);
3910} 3936}
3911 3937
3912/* 3938/*
@@ -3919,7 +3945,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3945{
3920 struct cgroup_event *event = container_of(wait, 3946 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3947 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3948 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3949 unsigned long flags = (unsigned long)key;
3924 3950
3925 if (flags & POLLHUP) { 3951 if (flags & POLLHUP) {
@@ -3963,14 +3989,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 3989 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 3990 * Interpretation of args is defined by control file implementation.
3965 */ 3991 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 3992static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 3993 struct cftype *cft, const char *buffer)
3968{ 3994{
3969 struct cgroup_event *event = NULL; 3995 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 3996 struct cgroup_event *event;
3997 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 3998 unsigned int efd, cfd;
3972 struct file *efile = NULL; 3999 struct fd efile;
3973 struct file *cfile = NULL; 4000 struct fd cfile;
3974 char *endp; 4001 char *endp;
3975 int ret; 4002 int ret;
3976 4003
@@ -3987,109 +4014,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4014 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4015 if (!event)
3989 return -ENOMEM; 4016 return -ENOMEM;
3990 event->cgrp = cgrp; 4017
3991 INIT_LIST_HEAD(&event->list); 4018 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4019 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4020 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3994 INIT_WORK(&event->remove, cgroup_event_remove); 4021 INIT_WORK(&event->remove, cgroup_event_remove);
3995 4022
3996 efile = eventfd_fget(efd); 4023 efile = fdget(efd);
3997 if (IS_ERR(efile)) { 4024 if (!efile.file) {
3998 ret = PTR_ERR(efile); 4025 ret = -EBADF;
3999 goto fail; 4026 goto out_kfree;
4000 } 4027 }
4001 4028
4002 event->eventfd = eventfd_ctx_fileget(efile); 4029 event->eventfd = eventfd_ctx_fileget(efile.file);
4003 if (IS_ERR(event->eventfd)) { 4030 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4031 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4032 goto out_put_efile;
4006 } 4033 }
4007 4034
4008 cfile = fget(cfd); 4035 cfile = fdget(cfd);
4009 if (!cfile) { 4036 if (!cfile.file) {
4010 ret = -EBADF; 4037 ret = -EBADF;
4011 goto fail; 4038 goto out_put_eventfd;
4012 } 4039 }
4013 4040
4014 /* the process needs read permission on the control file */ 4041 /* the process needs read permission on the control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4042 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4043 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4017 if (ret < 0) 4044 if (ret < 0)
4018 goto fail; 4045 goto out_put_cfile;
4019 4046
4020 event->cft = __file_cft(cfile); 4047 event->cft = __file_cft(cfile.file);
4021 if (IS_ERR(event->cft)) { 4048 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4049 ret = PTR_ERR(event->cft);
4023 goto fail; 4050 goto out_put_cfile;
4051 }
4052
4053 if (!event->cft->ss) {
4054 ret = -EBADF;
4055 goto out_put_cfile;
4024 } 4056 }
4025 4057
4026 /* 4058 /*
4027 * The file to be monitored must be in the same cgroup as 4059 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4060 * cgroup as cgroup.event_control, and associate @event with it.
4061 * Remaining events are automatically removed on cgroup destruction
4062 * but the removal is asynchronous, so take an extra ref.
4029 */ 4063 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4064 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4065
4032 ret = -EINVAL; 4066 ret = -EINVAL;
4033 goto fail; 4067 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4068 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4069 if (event->css && event->css == cfile_css && css_tryget(event->css))
4070 ret = 0;
4071
4072 rcu_read_unlock();
4073 if (ret)
4074 goto out_put_cfile;
4035 4075
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4076 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4077 ret = -EINVAL;
4038 goto fail; 4078 goto out_put_css;
4039 } 4079 }
4040 4080
4041 ret = event->cft->register_event(cgrp, event->cft, 4081 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4082 event->eventfd, buffer);
4043 if (ret) 4083 if (ret)
4044 goto fail; 4084 goto out_put_css;
4045
4046 efile->f_op->poll(efile, &event->pt);
4047 4085
4048 /* 4086 efile.file->f_op->poll(efile.file, &event->pt);
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054 4087
4055 spin_lock(&cgrp->event_list_lock); 4088 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4089 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4090 spin_unlock(&cgrp->event_list_lock);
4058 4091
4059 fput(cfile); 4092 fdput(cfile);
4060 fput(efile); 4093 fdput(efile);
4061 4094
4062 return 0; 4095 return 0;
4063 4096
4064fail: 4097out_put_css:
4065 if (cfile) 4098 css_put(event->css);
4066 fput(cfile); 4099out_put_cfile:
4067 4100 fdput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4101out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4102 eventfd_ctx_put(event->eventfd);
4070 4103out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4104 fdput(efile);
4072 fput(efile); 4105out_kfree:
4073
4074 kfree(event); 4106 kfree(event);
4075 4107
4076 return ret; 4108 return ret;
4077} 4109}
4078 4110
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4111static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4112 struct cftype *cft)
4081{ 4113{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4114 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4115}
4084 4116
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4117static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4118 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4119{
4089 if (val) 4120 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4121 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4122 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4123 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4124 return 0;
4094} 4125}
4095 4126
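The event-control code above is the lookup-and-pin idiom this patch standardizes on: a css found under rcu_read_lock() may already be on its way out, so it has to be nailed down with css_tryget() before the read-side critical section ends. A minimal sketch of that idiom as a stand-alone helper (pin_css() is an invented name; the sketch assumes it lives in kernel/cgroup.c where the file-local cgroup_css() accessor used throughout this patch is visible):

static struct cgroup_subsys_state *pin_css(struct cgroup *cgrp,
					   struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);	/* RCU-protected lookup */
	if (css && !css_tryget(css))	/* fails once the css is being killed */
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller drops the reference with css_put() */
}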
@@ -4148,56 +4179,82 @@ static struct cftype cgroup_base_files[] = {
4148}; 4179};
4149 4180
4150/** 4181/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4182 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4183 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4184 * @subsys_mask: mask of the subsystem ids whose files should be added
4185 *
4186 * On failure, no file is added.
4155 */ 4187 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4188static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4189{
4159 int err;
4160 struct cgroup_subsys *ss; 4190 struct cgroup_subsys *ss;
4161 4191 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4192
4168 /* process cftsets of each subsystem */ 4193 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4194 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4195 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4196
4197 if (!test_bit(i, &subsys_mask))
4172 continue; 4198 continue;
4173 4199
4174 list_for_each_entry(set, &ss->cftsets, node) 4200 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4201 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4202 if (ret < 0)
4203 goto err;
4204 }
4176 } 4205 }
4206 return 0;
4207err:
4208 cgroup_clear_dir(cgrp, subsys_mask);
4209 return ret;
4210}
4177 4211
4178 /* This cgroup is ready now */ 4212/*
4179 for_each_root_subsys(cgrp->root, ss) { 4213 * css destruction is a four-stage process.
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4214 *
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4215 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4216 * Implemented in kill_css().
4217 *
4218 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4219 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4220 * by invoking offline_css(). After offlining, the base ref is put.
4221 * Implemented in css_killed_work_fn().
4222 *
4223 * 3. When the percpu_ref reaches zero, the only possible remaining
4224 * accessors are inside RCU read sections. css_release() schedules the
4225 * RCU callback.
4226 *
4227 * 4. After the grace period, the css can be freed. Implemented in
4228 * css_free_work_fn().
4229 *
 4230 * It is actually hairier because both steps 2 and 4 require process context
 4231 * and thus involve punting to css->destroy_work, adding two additional
 4232 * steps to the already complex sequence.
4233 */
4234static void css_free_work_fn(struct work_struct *work)
4235{
4236 struct cgroup_subsys_state *css =
4237 container_of(work, struct cgroup_subsys_state, destroy_work);
4238 struct cgroup *cgrp = css->cgroup;
4182 4239
4183 /* 4240 if (css->parent)
4184 * Update id->css pointer and make this css visible from 4241 css_put(css->parent);
4185 * CSS ID functions. This pointer will be dereferened
4186 * from RCU-read-side without locks.
4187 */
4188 if (id)
4189 rcu_assign_pointer(id->css, css);
4190 }
4191 4242
4192 return 0; 4243 css->ss->css_free(css);
4244 cgroup_dput(cgrp);
4193} 4245}
4194 4246
4195static void css_dput_fn(struct work_struct *work) 4247static void css_free_rcu_fn(struct rcu_head *rcu_head)
4196{ 4248{
4197 struct cgroup_subsys_state *css = 4249 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4250 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4199 4251
4200 cgroup_dput(css->cgroup); 4252 /*
4253 * css holds an extra ref to @cgrp->dentry which is put on the last
4254 * css_put(). dput() requires process context which we don't have.
4255 */
4256 INIT_WORK(&css->destroy_work, css_free_work_fn);
4257 schedule_work(&css->destroy_work);
4201} 4258}
4202 4259
4203static void css_release(struct percpu_ref *ref) 4260static void css_release(struct percpu_ref *ref)
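css_release(), css_free_rcu_fn() and css_free_work_fn() above form a reusable shape: the percpu_ref release callback runs in atomic context and can only queue an RCU callback, the RCU callback still cannot sleep, and only the work item it schedules runs in process context where sleeping operations such as dput() are allowed. A stripped-down sketch of the same two-step deferral on a hypothetical refcounted object (struct obj and the obj_* functions are invented, not part of the patch):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	struct percpu_ref refcnt;
	struct rcu_head rcu_head;
	struct work_struct destroy_work;
};

static void obj_free_work_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, destroy_work);

	/* process context at last: sleeping operations such as dput() are fine */
	kfree(o);
}

static void obj_free_rcu_fn(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu_head);

	/* RCU callback context cannot sleep either, so punt to a work item */
	INIT_WORK(&o->destroy_work, obj_free_work_fn);
	schedule_work(&o->destroy_work);
}

static void obj_release(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	/* last reference gone; wait out RCU readers before touching @o */
	call_rcu(&o->rcu_head, obj_free_rcu_fn);
}

static int obj_init(struct obj *o)
{
	/* obj_release() runs once the final reference is put */
	return percpu_ref_init(&o->refcnt, obj_release);
}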
@@ -4205,49 +4262,46 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4262 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4263 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4264
4208 schedule_work(&css->dput_work); 4265 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4266}
4210 4267
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4268static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4269 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4270{
4215 css->cgroup = cgrp; 4271 css->cgroup = cgrp;
4272 css->ss = ss;
4216 css->flags = 0; 4273 css->flags = 0;
4217 css->id = NULL; 4274
4218 if (cgrp == cgroup_dummy_top) 4275 if (cgrp->parent)
4276 css->parent = cgroup_css(cgrp->parent, ss);
4277 else
4219 css->flags |= CSS_ROOT; 4278 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4279
4223 /* 4280 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4281}
4231 4282
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4283/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4284static int online_css(struct cgroup_subsys_state *css)
4234{ 4285{
4286 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4287 int ret = 0;
4236 4288
4237 lockdep_assert_held(&cgroup_mutex); 4289 lockdep_assert_held(&cgroup_mutex);
4238 4290
4239 if (ss->css_online) 4291 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4292 ret = ss->css_online(css);
4241 if (!ret) 4293 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4294 css->flags |= CSS_ONLINE;
4295 css->cgroup->nr_css++;
4296 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4297 }
4243 return ret; 4298 return ret;
4244} 4299}
4245 4300
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4301/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4302static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4303{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4304 struct cgroup_subsys *ss = css->ss;
4251 4305
4252 lockdep_assert_held(&cgroup_mutex); 4306 lockdep_assert_held(&cgroup_mutex);
4253 4307
@@ -4255,9 +4309,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4309 return;
4256 4310
4257 if (ss->css_offline) 4311 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4312 ss->css_offline(css);
4259 4313
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4314 css->flags &= ~CSS_ONLINE;
4315 css->cgroup->nr_css--;
4316 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4317}
4262 4318
4263/* 4319/*
@@ -4271,6 +4327,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4327static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4328 umode_t mode)
4273{ 4329{
4330 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4331 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4332 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4333 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4345,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4345 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4346 rcu_assign_pointer(cgrp->name, name);
4290 4347
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4348 /*
4349 * Temporarily set the pointer to NULL, so idr_find() won't return
4350 * a half-baked cgroup.
4351 */
4352 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4353 if (cgrp->id < 0)
4293 goto err_free_name; 4354 goto err_free_name;
4294 4355
@@ -4317,6 +4378,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4378 cgrp->dentry = dentry;
4318 4379
4319 cgrp->parent = parent; 4380 cgrp->parent = parent;
4381 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4382 cgrp->root = parent->root;
4321 4383
4322 if (notify_on_release(parent)) 4384 if (notify_on_release(parent))
@@ -4328,25 +4390,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4390 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4391 struct cgroup_subsys_state *css;
4330 4392
4331 css = ss->css_alloc(cgrp); 4393 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4394 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4395 err = PTR_ERR(css);
4334 goto err_free_all; 4396 goto err_free_all;
4335 } 4397 }
4398 css_ar[ss->subsys_id] = css;
4336 4399
4337 err = percpu_ref_init(&css->refcnt, css_release); 4400 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4401 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4402 goto err_free_all;
4341 }
4342
4343 init_cgroup_css(css, ss, cgrp);
4344 4403
4345 if (ss->use_id) { 4404 init_css(css, ss, cgrp);
4346 err = alloc_css_id(ss, parent, cgrp);
4347 if (err)
4348 goto err_free_all;
4349 }
4350 } 4405 }
4351 4406
4352 /* 4407 /*
@@ -4365,16 +4420,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4420 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4421 root->number_of_cgroups++;
4367 4422
4368 /* each css holds a ref to the cgroup's dentry */ 4423 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4424 for_each_root_subsys(root, ss) {
4425 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4426
4370 dget(dentry); 4427 dget(dentry);
4428 css_get(css->parent);
4429 }
4371 4430
4372 /* hold a ref to the parent's dentry */ 4431 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4432 dget(parent->dentry);
4374 4433
4375 /* creation succeeded, notify subsystems */ 4434 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4435 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4436 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4437
4438 err = online_css(css);
4378 if (err) 4439 if (err)
4379 goto err_destroy; 4440 goto err_destroy;
4380 4441
@@ -4388,7 +4449,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4449 }
4389 } 4450 }
4390 4451
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4452 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4453
4454 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4455 if (err)
4456 goto err_destroy;
4457
4458 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4459 if (err)
4393 goto err_destroy; 4460 goto err_destroy;
4394 4461
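cgroup_create() above publishes the new cgroup in two idr steps: idr_alloc() reserves the id with a NULL pointer so a concurrent idr_find() can never hand out a half-built cgroup, and idr_replace() installs the real pointer only after setup succeeds; teardown removes the id before the object goes away. The same reserve-then-publish pattern on a hypothetical registry (struct entry, entry_idr and the entry_* helpers are invented):

#include <linux/gfp.h>
#include <linux/idr.h>

struct entry {
	int id;
	/* ... controller-specific fields ... */
};

static DEFINE_IDR(entry_idr);

static int entry_register(struct entry *e)
{
	int id;

	/* reserve an id but publish nothing; idr_find() still returns NULL */
	id = idr_alloc(&entry_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	e->id = id;

	/* ... finish initializing @e ... */

	/* only now make @e visible to lookups */
	idr_replace(&entry_idr, e, e->id);
	return 0;
}

static void entry_unregister(struct entry *e)
{
	/* stop lookups before @e is torn down */
	idr_remove(&entry_idr, e->id);
	e->id = -1;
}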
@@ -4399,18 +4466,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4466
4400err_free_all: 4467err_free_all:
4401 for_each_root_subsys(root, ss) { 4468 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4469 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4470
4404 if (css) { 4471 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4472 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4473 ss->css_free(css);
4407 } 4474 }
4408 } 4475 }
4409 mutex_unlock(&cgroup_mutex); 4476 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4477 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4478 deactivate_super(sb);
4412err_free_id: 4479err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4480 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4481err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4482 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4483err_free_cgrp:
@@ -4432,22 +4499,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4499 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4500}
4434 4501
4435static void cgroup_css_killed(struct cgroup *cgrp) 4502/*
4503 * This is called when the refcnt of a css is confirmed to be killed.
4504 * css_tryget() is now guaranteed to fail.
4505 */
4506static void css_killed_work_fn(struct work_struct *work)
4436{ 4507{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4508 struct cgroup_subsys_state *css =
4438 return; 4509 container_of(work, struct cgroup_subsys_state, destroy_work);
4510 struct cgroup *cgrp = css->cgroup;
4439 4511
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4512 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4513
4442 schedule_work(&cgrp->destroy_work); 4514 /*
4515 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4516 * initiate destruction.
4517 */
4518 offline_css(css);
4519
4520 /*
4521 * If @cgrp is marked dead, it's waiting for refs of all css's to
4522 * be disabled before proceeding to the second phase of cgroup
4523 * destruction. If we are the last one, kick it off.
4524 */
4525 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4526 cgroup_destroy_css_killed(cgrp);
4527
4528 mutex_unlock(&cgroup_mutex);
4529
4530 /*
4531 * Put the css refs from kill_css(). Each css holds an extra
4532 * reference to the cgroup's dentry and cgroup removal proceeds
4533 * regardless of css refs. On the last put of each css, whenever
4534 * that may be, the extra dentry ref is put so that dentry
4535 * destruction happens only after all css's are released.
4536 */
4537 css_put(css);
4443} 4538}
4444 4539
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4540/* css kill confirmation processing requires process context, bounce */
4541static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4542{
4447 struct cgroup_subsys_state *css = 4543 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4544 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4545
4450 cgroup_css_killed(css->cgroup); 4546 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4547 schedule_work(&css->destroy_work);
4548}
4549
4550/**
4551 * kill_css - destroy a css
4552 * @css: css to destroy
4553 *
4554 * This function initiates destruction of @css by removing cgroup interface
4555 * files and putting its base reference. ->css_offline() will be invoked
4556 * asynchronously once css_tryget() is guaranteed to fail and when the
4557 * reference count reaches zero, @css will be released.
4558 */
4559static void kill_css(struct cgroup_subsys_state *css)
4560{
4561 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4562
4563 /*
4564 * Killing would put the base ref, but we need to keep it alive
4565 * until after ->css_offline().
4566 */
4567 css_get(css);
4568
4569 /*
4570 * cgroup core guarantees that, by the time ->css_offline() is
4571 * invoked, no new css reference will be given out via
4572 * css_tryget(). We can't simply call percpu_ref_kill() and
4573 * proceed to offlining css's because percpu_ref_kill() doesn't
4574 * guarantee that the ref is seen as killed on all CPUs on return.
4575 *
4576 * Use percpu_ref_kill_and_confirm() to get notifications as each
4577 * css is confirmed to be seen as killed on all CPUs.
4578 */
4579 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4580}
4452 4581
4453/** 4582/**
@@ -4480,6 +4609,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4480 struct dentry *d = cgrp->dentry; 4609 struct dentry *d = cgrp->dentry;
4481 struct cgroup_event *event, *tmp; 4610 struct cgroup_event *event, *tmp;
4482 struct cgroup_subsys *ss; 4611 struct cgroup_subsys *ss;
4612 struct cgroup *child;
4483 bool empty; 4613 bool empty;
4484 4614
4485 lockdep_assert_held(&d->d_inode->i_mutex); 4615 lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4490,47 +4620,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4490 * @cgrp from being removed while __put_css_set() is in progress. 4620 * @cgrp from being removed while __put_css_set() is in progress.
4491 */ 4621 */
4492 read_lock(&css_set_lock); 4622 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); 4623 empty = list_empty(&cgrp->cset_links);
4494 read_unlock(&css_set_lock); 4624 read_unlock(&css_set_lock);
4495 if (!empty) 4625 if (!empty)
4496 return -EBUSY; 4626 return -EBUSY;
4497 4627
4498 /* 4628 /*
4499 * Block new css_tryget() by killing css refcnts. cgroup core 4629 * Make sure there's no live children. We can't test ->children
4500 * guarantees that, by the time ->css_offline() is invoked, no new 4630 * emptiness as dead children linger on it while being destroyed;
4501 * css reference will be given out via css_tryget(). We can't 4631 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4512 */ 4632 */
4513 atomic_set(&cgrp->css_kill_cnt, 1); 4633 empty = true;
4514 for_each_root_subsys(cgrp->root, ss) { 4634 rcu_read_lock();
4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4635 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4516 4636 empty = cgroup_is_dead(child);
4517 /* 4637 if (!empty)
4518 * Killing would put the base ref, but we need to keep it 4638 break;
4519 * alive until after ->css_offline.
4520 */
4521 percpu_ref_get(&css->refcnt);
4522
4523 atomic_inc(&cgrp->css_kill_cnt);
4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4525 } 4639 }
4526 cgroup_css_killed(cgrp); 4640 rcu_read_unlock();
4641 if (!empty)
4642 return -EBUSY;
4643
4644 /*
4645 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4646 * will be invoked to perform the rest of destruction once the
4647 * percpu refs of all css's are confirmed to be killed.
4648 */
4649 for_each_root_subsys(cgrp->root, ss)
4650 kill_css(cgroup_css(cgrp, ss));
4527 4651
4528 /* 4652 /*
4529 * Mark @cgrp dead. This prevents further task migration and child 4653 * Mark @cgrp dead. This prevents further task migration and child
4530 * creation by disabling cgroup_lock_live_group(). Note that 4654 * creation by disabling cgroup_lock_live_group(). Note that
4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4655 * CGRP_DEAD assertion is depended upon by css_next_child() to
4532 * resume iteration after dropping RCU read lock. See 4656 * resume iteration after dropping RCU read lock. See
4533 * cgroup_next_sibling() for details. 4657 * css_next_child() for details.
4534 */ 4658 */
4535 set_bit(CGRP_DEAD, &cgrp->flags); 4659 set_bit(CGRP_DEAD, &cgrp->flags);
4536 4660
@@ -4541,9 +4665,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4541 raw_spin_unlock(&release_list_lock); 4665 raw_spin_unlock(&release_list_lock);
4542 4666
4543 /* 4667 /*
4544 * Remove @cgrp directory. The removal puts the base ref but we 4668 * If @cgrp has css's attached, the second stage of cgroup
4545 * aren't quite done with @cgrp yet, so hold onto it. 4669 * destruction is kicked off from css_killed_work_fn() after the
4670 * refs of all attached css's are killed. If @cgrp doesn't have
4671 * any css, we kick it off here.
4546 */ 4672 */
4673 if (!cgrp->nr_css)
4674 cgroup_destroy_css_killed(cgrp);
4675
4676 /*
4677 * Clear the base files and remove @cgrp directory. The removal
4678 * puts the base ref but we aren't quite done with @cgrp yet, so
4679 * hold onto it.
4680 */
4681 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4547 dget(d); 4682 dget(d);
4548 cgroup_d_remove_dir(d); 4683 cgroup_d_remove_dir(d);
4549 4684
@@ -4563,50 +4698,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4563}; 4698};
4564 4699
4565/** 4700/**
4566 * cgroup_offline_fn - the second step of cgroup destruction 4701 * cgroup_destroy_css_killed - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work 4702 * @cgrp: cgroup whose css's have been offlined
4568 * 4703 *
4569 * This function is invoked from a work item for a cgroup which is being 4704 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be 4705 * destroyed after all css's are offlined and performs the rest of
4571 * seen as killed on all CPUs, and performs the rest of destruction. This 4706 * destruction. This is the second step of destruction described in the
4572 * is the second step of destruction described in the comment above 4707 * comment above cgroup_destroy_locked().
4573 * cgroup_destroy_locked().
4574 */ 4708 */
4575static void cgroup_offline_fn(struct work_struct *work) 4709static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4576{ 4710{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent; 4711 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry; 4712 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581 4713
4582 mutex_lock(&cgroup_mutex); 4714 lockdep_assert_held(&cgroup_mutex);
4583 4715
4584 /* 4716 /* delete this cgroup from parent->children */
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to 4717 list_del_rcu(&cgrp->sibling);
4586 * initate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590 4718
4591 /* 4719 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds 4720 * We should remove the cgroup object from idr before its grace
4593 * an extra reference to the cgroup's dentry and cgroup removal 4721 * period starts, so we won't be looking up a cgroup while the
4594 * proceeds regardless of css refs. On the last put of each css, 4722 * cgroup is being freed.
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */ 4723 */
4598 for_each_root_subsys(cgrp->root, ss) 4724 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4599 css_put(cgrp->subsys[ss->subsys_id]); 4725 cgrp->id = -1;
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603 4726
4604 dput(d); 4727 dput(d);
4605 4728
4606 set_bit(CGRP_RELEASABLE, &parent->flags); 4729 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent); 4730 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4610} 4731}
4611 4732
4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4733static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4629,6 +4750,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4629 * deregistration. 4750 * deregistration.
4630 */ 4751 */
4631 if (ss->base_cftypes) { 4752 if (ss->base_cftypes) {
4753 struct cftype *cft;
4754
4755 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4756 cft->ss = ss;
4757
4632 ss->base_cftset.cfts = ss->base_cftypes; 4758 ss->base_cftset.cfts = ss->base_cftypes;
4633 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4759 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4634 } 4760 }
@@ -4648,10 +4774,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4648 /* Create the top cgroup state for this subsystem */ 4774 /* Create the top cgroup state for this subsystem */
4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4775 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4650 ss->root = &cgroup_dummy_root; 4776 ss->root = &cgroup_dummy_root;
4651 css = ss->css_alloc(cgroup_dummy_top); 4777 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4652 /* We don't handle early failures gracefully */ 4778 /* We don't handle early failures gracefully */
4653 BUG_ON(IS_ERR(css)); 4779 BUG_ON(IS_ERR(css));
4654 init_cgroup_css(css, ss, cgroup_dummy_top); 4780 init_css(css, ss, cgroup_dummy_top);
4655 4781
4656 /* Update the init_css_set to contain a subsys 4782 /* Update the init_css_set to contain a subsys
4657 * pointer to this state - since the subsystem is 4783 * pointer to this state - since the subsystem is
@@ -4666,7 +4792,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4666 * need to invoke fork callbacks here. */ 4792 * need to invoke fork callbacks here. */
4667 BUG_ON(!list_empty(&init_task.tasks)); 4793 BUG_ON(!list_empty(&init_task.tasks));
4668 4794
4669 BUG_ON(online_css(ss, cgroup_dummy_top)); 4795 BUG_ON(online_css(css));
4670 4796
4671 mutex_unlock(&cgroup_mutex); 4797 mutex_unlock(&cgroup_mutex);
4672 4798
@@ -4727,7 +4853,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4727 * struct, so this can happen first (i.e. before the dummy root 4853 * struct, so this can happen first (i.e. before the dummy root
4728 * attachment). 4854 * attachment).
4729 */ 4855 */
4730 css = ss->css_alloc(cgroup_dummy_top); 4856 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4731 if (IS_ERR(css)) { 4857 if (IS_ERR(css)) {
4732 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4858 /* failure case - need to deassign the cgroup_subsys[] slot. */
4733 cgroup_subsys[ss->subsys_id] = NULL; 4859 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4739,13 +4865,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4739 ss->root = &cgroup_dummy_root; 4865 ss->root = &cgroup_dummy_root;
4740 4866
4741 /* our new subsystem will be attached to the dummy hierarchy. */ 4867 /* our new subsystem will be attached to the dummy hierarchy. */
4742 init_cgroup_css(css, ss, cgroup_dummy_top); 4868 init_css(css, ss, cgroup_dummy_top);
4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4744 if (ss->use_id) {
4745 ret = cgroup_init_idr(ss, css);
4746 if (ret)
4747 goto err_unload;
4748 }
4749 4869
4750 /* 4870 /*
4751 * Now we need to entangle the css into the existing css_sets. unlike 4871 * Now we need to entangle the css into the existing css_sets. unlike
@@ -4770,7 +4890,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4770 } 4890 }
4771 write_unlock(&css_set_lock); 4891 write_unlock(&css_set_lock);
4772 4892
4773 ret = online_css(ss, cgroup_dummy_top); 4893 ret = online_css(css);
4774 if (ret) 4894 if (ret)
4775 goto err_unload; 4895 goto err_unload;
4776 4896
@@ -4802,17 +4922,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4802 4922
4803 /* 4923 /*
4804 * we shouldn't be called if the subsystem is in use, and the use of 4924 * we shouldn't be called if the subsystem is in use, and the use of
4805 * try_module_get in parse_cgroupfs_options should ensure that it 4925 * try_module_get() in rebind_subsystems() should ensure that it
4806 * doesn't start being used while we're killing it off. 4926 * doesn't start being used while we're killing it off.
4807 */ 4927 */
4808 BUG_ON(ss->root != &cgroup_dummy_root); 4928 BUG_ON(ss->root != &cgroup_dummy_root);
4809 4929
4810 mutex_lock(&cgroup_mutex); 4930 mutex_lock(&cgroup_mutex);
4811 4931
4812 offline_css(ss, cgroup_dummy_top); 4932 offline_css(cgroup_css(cgroup_dummy_top, ss));
4813
4814 if (ss->use_id)
4815 idr_destroy(&ss->idr);
4816 4933
4817 /* deassign the subsys_id */ 4934 /* deassign the subsys_id */
4818 cgroup_subsys[ss->subsys_id] = NULL; 4935 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4840,11 +4957,10 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4840 /* 4957 /*
4841 * remove subsystem's css from the cgroup_dummy_top and free it - 4958 * remove subsystem's css from the cgroup_dummy_top and free it -
4842 * need to free before marking as null because ss->css_free needs 4959 * need to free before marking as null because ss->css_free needs
4843 * the cgrp->subsys pointer to find their state. note that this 4960 * the cgrp->subsys pointer to find their state.
4844 * also takes care of freeing the css_id.
4845 */ 4961 */
4846 ss->css_free(cgroup_dummy_top); 4962 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 4963 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4848 4964
4849 mutex_unlock(&cgroup_mutex); 4965 mutex_unlock(&cgroup_mutex);
4850} 4966}
@@ -4912,8 +5028,6 @@ int __init cgroup_init(void)
4912 for_each_builtin_subsys(ss, i) { 5028 for_each_builtin_subsys(ss, i) {
4913 if (!ss->early_init) 5029 if (!ss->early_init)
4914 cgroup_init_subsys(ss); 5030 cgroup_init_subsys(ss);
4915 if (ss->use_id)
4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4917 } 5031 }
4918 5032
4919 /* allocate id for the dummy hierarchy */ 5033 /* allocate id for the dummy hierarchy */
@@ -4926,6 +5040,10 @@ int __init cgroup_init(void)
4926 5040
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5041 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928 5042
5043 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5044 0, 1, GFP_KERNEL);
5045 BUG_ON(err < 0);
5046
4929 mutex_unlock(&cgroup_root_mutex); 5047 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex); 5048 mutex_unlock(&cgroup_mutex);
4931 5049
@@ -5082,7 +5200,7 @@ void cgroup_fork(struct task_struct *child)
5082 * Adds the task to the list running through its css_set if necessary and 5200 * Adds the task to the list running through its css_set if necessary and
5083 * call the subsystem fork() callbacks. Has to be after the task is 5201 * call the subsystem fork() callbacks. Has to be after the task is
5084 * visible on the task list in case we race with the first call to 5202 * visible on the task list in case we race with the first call to
5085 * cgroup_iter_start() - to guarantee that the new task ends up on its 5203 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5086 * list. 5204 * list.
5087 */ 5205 */
5088void cgroup_post_fork(struct task_struct *child) 5206void cgroup_post_fork(struct task_struct *child)
@@ -5195,10 +5313,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5195 */ 5313 */
5196 for_each_builtin_subsys(ss, i) { 5314 for_each_builtin_subsys(ss, i) {
5197 if (ss->exit) { 5315 if (ss->exit) {
5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5316 struct cgroup_subsys_state *old_css = cset->subsys[i];
5199 struct cgroup *cgrp = task_cgroup(tsk, i); 5317 struct cgroup_subsys_state *css = task_css(tsk, i);
5200 5318
5201 ss->exit(cgrp, old_cgrp, tsk); 5319 ss->exit(css, old_css, tsk);
5202 } 5320 }
5203 } 5321 }
5204 } 5322 }
@@ -5329,210 +5447,56 @@ static int __init cgroup_disable(char *str)
5329} 5447}
5330__setup("cgroup_disable=", cgroup_disable); 5448__setup("cgroup_disable=", cgroup_disable);
5331 5449
5332/*
5333 * Functons for CSS ID.
5334 */
5335
5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5337unsigned short css_id(struct cgroup_subsys_state *css)
5338{
5339 struct css_id *cssid;
5340
5341 /*
5342 * This css_id() can return correct value when somone has refcnt
5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5344 * it's unchanged until freed.
5345 */
5346 cssid = rcu_dereference_raw(css->id);
5347
5348 if (cssid)
5349 return cssid->id;
5350 return 0;
5351}
5352EXPORT_SYMBOL_GPL(css_id);
5353
5354/** 5450/**
5355 * css_is_ancestor - test "root" css is an ancestor of "child" 5451 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5356 * @child: the css to be tested. 5452 * @dentry: directory dentry of interest
5357 * @root: the css supporsed to be an ancestor of the child. 5453 * @ss: subsystem of interest
5358 * 5454 *
5359 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 5455 * Must be called under RCU read lock. The caller is responsible for
5360 * this function reads css->id, the caller must hold rcu_read_lock(). 5456 * pinning the returned css if it needs to be accessed outside the RCU
5361 * But, considering usual usage, the csses should be valid objects after test. 5457 * critical section.
5362 * Assuming that the caller will do some action to the child if this returns
5363 * returns true, the caller must take "child";s reference count.
5364 * If "child" is valid object and this returns true, "root" is valid, too.
5365 */
5366
5367bool css_is_ancestor(struct cgroup_subsys_state *child,
5368 const struct cgroup_subsys_state *root)
5369{
5370 struct css_id *child_id;
5371 struct css_id *root_id;
5372
5373 child_id = rcu_dereference(child->id);
5374 if (!child_id)
5375 return false;
5376 root_id = rcu_dereference(root->id);
5377 if (!root_id)
5378 return false;
5379 if (child_id->depth < root_id->depth)
5380 return false;
5381 if (child_id->stack[root_id->depth] != root_id->id)
5382 return false;
5383 return true;
5384}
5385
5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5387{
5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5390 /* When this is called before css_id initialization, id can be NULL */
5391 if (!id)
5392 return;
5393
5394 BUG_ON(!ss->use_id);
5395
5396 rcu_assign_pointer(id->css, NULL);
5397 rcu_assign_pointer(css->id, NULL);
5398 spin_lock(&ss->id_lock);
5399 idr_remove(&ss->idr, id->id);
5400 spin_unlock(&ss->id_lock);
5401 kfree_rcu(id, rcu_head);
5402}
5403EXPORT_SYMBOL_GPL(free_css_id);
5404
5405/*
5406 * This is called by init or create(). Then, calls to this function are
5407 * always serialized (By cgroup_mutex() at create()).
5408 */ 5458 */
5409 5459struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5410static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5460 struct cgroup_subsys *ss)
5411{ 5461{
5412 struct css_id *newid; 5462 struct cgroup *cgrp;
5413 int ret, size;
5414
5415 BUG_ON(!ss->use_id);
5416
5417 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5418 newid = kzalloc(size, GFP_KERNEL);
5419 if (!newid)
5420 return ERR_PTR(-ENOMEM);
5421
5422 idr_preload(GFP_KERNEL);
5423 spin_lock(&ss->id_lock);
5424 /* Don't use 0. allocates an ID of 1-65535 */
5425 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5426 spin_unlock(&ss->id_lock);
5427 idr_preload_end();
5428
5429 /* Returns error when there are no free spaces for new ID.*/
5430 if (ret < 0)
5431 goto err_out;
5432
5433 newid->id = ret;
5434 newid->depth = depth;
5435 return newid;
5436err_out:
5437 kfree(newid);
5438 return ERR_PTR(ret);
5439
5440}
5441
5442static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5443 struct cgroup_subsys_state *rootcss)
5444{
5445 struct css_id *newid;
5446
5447 spin_lock_init(&ss->id_lock);
5448 idr_init(&ss->idr);
5449
5450 newid = get_new_cssid(ss, 0);
5451 if (IS_ERR(newid))
5452 return PTR_ERR(newid);
5453
5454 newid->stack[0] = newid->id;
5455 RCU_INIT_POINTER(newid->css, rootcss);
5456 RCU_INIT_POINTER(rootcss->id, newid);
5457 return 0;
5458}
5459
5460static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5461 struct cgroup *child)
5462{
5463 int subsys_id, i, depth = 0;
5464 struct cgroup_subsys_state *parent_css, *child_css;
5465 struct css_id *child_id, *parent_id;
5466
5467 subsys_id = ss->subsys_id;
5468 parent_css = parent->subsys[subsys_id];
5469 child_css = child->subsys[subsys_id];
5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5471 depth = parent_id->depth + 1;
5472 5463
5473 child_id = get_new_cssid(ss, depth); 5464 WARN_ON_ONCE(!rcu_read_lock_held());
5474 if (IS_ERR(child_id))
5475 return PTR_ERR(child_id);
5476 5465
5477 for (i = 0; i < depth; i++) 5466 /* is @dentry a cgroup dir? */
5478 child_id->stack[i] = parent_id->stack[i]; 5467 if (!dentry->d_inode ||
5479 child_id->stack[depth] = child_id->id; 5468 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5480 /* 5469 return ERR_PTR(-EBADF);
5481 * child_id->css pointer will be set after this cgroup is available
5482 * see cgroup_populate_dir()
5483 */
5484 rcu_assign_pointer(child_css->id, child_id);
5485 5470
5486 return 0; 5471 cgrp = __d_cgrp(dentry);
5472 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5487} 5473}
5488 5474
5489/** 5475/**
5490 * css_lookup - lookup css by id 5476 * css_from_id - lookup css by id
5491 * @ss: cgroup subsys to be looked into. 5477 * @id: the cgroup id
5492 * @id: the id 5478 * @ss: cgroup subsys to be looked into
5493 * 5479 *
5494 * Returns pointer to cgroup_subsys_state if there is valid one with id. 5480 * Returns the css if there's a valid one with @id, otherwise returns NULL.
5495 * NULL if not. Should be called under rcu_read_lock() 5481 * Should be called under rcu_read_lock().
5496 */ 5482 */
5497struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) 5483struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5498{
5499 struct css_id *cssid = NULL;
5500
5501 BUG_ON(!ss->use_id);
5502 cssid = idr_find(&ss->idr, id);
5503
5504 if (unlikely(!cssid))
5505 return NULL;
5506
5507 return rcu_dereference(cssid->css);
5508}
5509EXPORT_SYMBOL_GPL(css_lookup);
5510
5511/*
5512 * get corresponding css from file open on cgroupfs directory
5513 */
5514struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5515{ 5484{
5516 struct cgroup *cgrp; 5485 struct cgroup *cgrp;
5517 struct inode *inode;
5518 struct cgroup_subsys_state *css;
5519 5486
5520 inode = file_inode(f); 5487 rcu_lockdep_assert(rcu_read_lock_held() ||
5521 /* check in cgroup filesystem dir */ 5488 lockdep_is_held(&cgroup_mutex),
5522 if (inode->i_op != &cgroup_dir_inode_operations) 5489 "css_from_id() needs proper protection");
5523 return ERR_PTR(-EBADF);
5524 5490
5525 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5491 cgrp = idr_find(&ss->root->cgroup_idr, id);
5526 return ERR_PTR(-EINVAL); 5492 if (cgrp)
5527 5493 return cgroup_css(cgrp, ss);
5528 /* get cgroup */ 5494 return NULL;
5529 cgrp = __d_cgrp(f->f_dentry);
5530 css = cgrp->subsys[id];
5531 return css ? css : ERR_PTR(-ENOENT);
5532} 5495}
5533 5496
5534#ifdef CONFIG_CGROUP_DEBUG 5497#ifdef CONFIG_CGROUP_DEBUG
5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5498static struct cgroup_subsys_state *
5499debug_css_alloc(struct cgroup_subsys_state *parent_css)
5536{ 5500{
5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5501 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5538 5502
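css_from_id() above replaces css_lookup(): ids now come from the per-hierarchy cgroup_idr instead of the removed css_id machinery, and the caller is expected to hold rcu_read_lock() and pin the result itself. A sketch of how a caller might resolve and pin a css from a stored id (get_css_by_id() is an invented wrapper, not part of the patch):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static struct cgroup_subsys_state *get_css_by_id(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);	/* NULL if no cgroup currently has @id */
	if (css && !css_tryget(css))	/* pin it before leaving the RCU section */
		css = NULL;
	rcu_read_unlock();

	return css;	/* release with css_put() when done */
}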
@@ -5542,22 +5506,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5542 return css; 5506 return css;
5543} 5507}
5544 5508
5545static void debug_css_free(struct cgroup *cgrp) 5509static void debug_css_free(struct cgroup_subsys_state *css)
5546{ 5510{
5547 kfree(cgrp->subsys[debug_subsys_id]); 5511 kfree(css);
5548} 5512}
5549 5513
5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5514static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5515 struct cftype *cft)
5551{ 5516{
5552 return cgroup_task_count(cgrp); 5517 return cgroup_task_count(css->cgroup);
5553} 5518}
5554 5519
5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5520static u64 current_css_set_read(struct cgroup_subsys_state *css,
5521 struct cftype *cft)
5556{ 5522{
5557 return (u64)(unsigned long)current->cgroups; 5523 return (u64)(unsigned long)current->cgroups;
5558} 5524}
5559 5525
5560static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5526static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5561 struct cftype *cft) 5527 struct cftype *cft)
5562{ 5528{
5563 u64 count; 5529 u64 count;
@@ -5568,7 +5534,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5568 return count; 5534 return count;
5569} 5535}
5570 5536
5571static int current_css_set_cg_links_read(struct cgroup *cgrp, 5537static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5572 struct cftype *cft, 5538 struct cftype *cft,
5573 struct seq_file *seq) 5539 struct seq_file *seq)
5574{ 5540{
@@ -5595,14 +5561,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5595} 5561}
5596 5562
5597#define MAX_TASKS_SHOWN_PER_CSS 25 5563#define MAX_TASKS_SHOWN_PER_CSS 25
5598static int cgroup_css_links_read(struct cgroup *cgrp, 5564static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5599 struct cftype *cft, 5565 struct cftype *cft, struct seq_file *seq)
5600 struct seq_file *seq)
5601{ 5566{
5602 struct cgrp_cset_link *link; 5567 struct cgrp_cset_link *link;
5603 5568
5604 read_lock(&css_set_lock); 5569 read_lock(&css_set_lock);
5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5570 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5606 struct css_set *cset = link->cset; 5571 struct css_set *cset = link->cset;
5607 struct task_struct *task; 5572 struct task_struct *task;
5608 int count = 0; 5573 int count = 0;
@@ -5621,9 +5586,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5621 return 0; 5586 return 0;
5622} 5587}
5623 5588
5624static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5589static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5625{ 5590{
5626 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5591 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5627} 5592}
5628 5593
5629static struct cftype debug_files[] = { 5594static struct cftype debug_files[] = {
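The debug handlers above show the calling convention every cftype method ends up with after this series: the first argument is the cgroup_subsys_state, and a controller recovers its private state by embedding the css in its own structure and using container_of(). A minimal sketch of that embedding for a made-up controller (struct foo_state, foo_css(), foo_weight_read() and foo_files are invented names; the read_u64 signature matches the converted handlers above):

#include <linux/cgroup.h>
#include <linux/kernel.h>

struct foo_state {
	struct cgroup_subsys_state css;	/* embedded so container_of() works */
	u64 weight;
};

static inline struct foo_state *foo_css(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct foo_state, css) : NULL;
}

/* same read_u64 shape as the converted debug handlers above */
static u64 foo_weight_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return foo_css(css)->weight;
}

static struct cftype foo_files[] = {
	{
		.name = "weight",
		.read_u64 = foo_weight_read,
	},
	{ }	/* empty name terminates the array */
};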
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
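
The freezer_change_state() rework above drops the separate self-update and handles both cases inside one pre-order descendant walk: the freezer being written to gets CGROUP_FREEZING_SELF, and every descendant inherits CGROUP_FREEZING_PARENT from the state its parent was given earlier in the same pass. A minimal user-space sketch of that propagation follows; plain recursion and a fixed toy tree stand in for the css iterator, the per-freezer spinlocks and RCU, and all names other than the flag macros are invented.

/*
 * User-space model of the one-pass freezer state propagation: apply
 * FREEZING_SELF at the written-to node, FREEZING_PARENT below it, based
 * on the parent state updated earlier in the same pre-order walk.
 */
#include <stdbool.h>
#include <stdio.h>

#define CGROUP_FREEZING_SELF   0x1
#define CGROUP_FREEZING_PARENT 0x2
#define CGROUP_FREEZING        (CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

struct freezer {
	const char *name;
	unsigned int state;
	struct freezer *parent;
	struct freezer *children[4];
};

static void freezer_apply_state(struct freezer *f, bool freeze, unsigned int bit)
{
	if (freeze)
		f->state |= bit;
	else
		f->state &= ~bit;
}

/* Pre-order walk: visit the node itself first, then each subtree. */
static void walk(struct freezer *pos, struct freezer *target, bool freeze)
{
	if (pos == target)
		freezer_apply_state(pos, freeze, CGROUP_FREEZING_SELF);
	else
		freezer_apply_state(pos, pos->parent->state & CGROUP_FREEZING,
				    CGROUP_FREEZING_PARENT);

	for (int i = 0; i < 4 && pos->children[i]; i++)
		walk(pos->children[i], target, freeze);
}

int main(void)
{
	struct freezer root = { .name = "root" };
	struct freezer a = { .name = "a", .parent = &root };
	struct freezer a1 = { .name = "a/1", .parent = &a };

	root.children[0] = &a;
	a.children[0] = &a1;

	walk(&root, &root, true);	/* echo FROZEN > root/freezer.state */

	printf("%-5s state=%#x\n", root.name, root.state);
	printf("%-5s state=%#x\n", a.name, a.state);
	printf("%-5s state=%#x\n", a1.name, a1.state);
	return 0;
}

Because pre-order visits a parent before its children, each child reads a parent->state that has already been updated in the same pass, which is the property the hunk above relies on when it folds the self-update into the loop.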
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,26 +20,46 @@
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22 22
23DEFINE_PER_CPU(struct context_tracking, context_tracking) = { 23#define CREATE_TRACE_POINTS
24#ifdef CONFIG_CONTEXT_TRACKING_FORCE 24#include <trace/events/context_tracking.h>
25 .active = true, 25
26#endif 26struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
27}; 27EXPORT_SYMBOL_GPL(context_tracking_enabled);
28
29DEFINE_PER_CPU(struct context_tracking, context_tracking);
30EXPORT_SYMBOL_GPL(context_tracking);
31
32void context_tracking_cpu_set(int cpu)
33{
34 if (!per_cpu(context_tracking.active, cpu)) {
35 per_cpu(context_tracking.active, cpu) = true;
36 static_key_slow_inc(&context_tracking_enabled);
37 }
38}
28 39
29/** 40/**
30 * user_enter - Inform the context tracking that the CPU is going to 41 * context_tracking_user_enter - Inform the context tracking that the CPU is going to
31 * enter userspace mode. 42 * enter userspace mode.
32 * 43 *
33 * This function must be called right before we switch from the kernel 44 * This function must be called right before we switch from the kernel
34 * to userspace, when it's guaranteed the remaining kernel instructions 45 * to userspace, when it's guaranteed the remaining kernel instructions
35 * to execute won't use any RCU read side critical section because this 46 * to execute won't use any RCU read side critical section because this
36 * function sets RCU in extended quiescent state. 47 * function sets RCU in extended quiescent state.
37 */ 48 */
38void user_enter(void) 49void context_tracking_user_enter(void)
39{ 50{
40 unsigned long flags; 51 unsigned long flags;
41 52
42 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key
57 * check.
58 */
59 if (!static_key_false(&context_tracking_enabled))
60 return;
61
62 /*
 43 * Some contexts may involve an exception occurring in an irq, 63 * Some contexts may involve an exception occurring in an irq,

44 * leading to that nesting: 64 * leading to that nesting:
45 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() 65 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
@@ -54,17 +74,32 @@ void user_enter(void)
54 WARN_ON_ONCE(!current->mm); 74 WARN_ON_ONCE(!current->mm);
55 75
56 local_irq_save(flags); 76 local_irq_save(flags);
57 if (__this_cpu_read(context_tracking.active) && 77 if ( __this_cpu_read(context_tracking.state) != IN_USER) {
58 __this_cpu_read(context_tracking.state) != IN_USER) { 78 if (__this_cpu_read(context_tracking.active)) {
79 trace_user_enter(0);
80 /*
81 * At this stage, only low level arch entry code remains and
82 * then we'll run in userspace. We can assume there won't be
83 * any RCU read-side critical section until the next call to
84 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
85 * on the tick.
86 */
87 vtime_user_enter(current);
88 rcu_user_enter();
89 }
59 /* 90 /*
60 * At this stage, only low level arch entry code remains and 91 * Even if context tracking is disabled on this CPU, because it's outside
61 * then we'll run in userspace. We can assume there won't be 92 * the full dynticks mask for example, we still have to keep track of the
62 * any RCU read-side critical section until the next call to 93 * context transitions and states to prevent inconsistency on those of
63 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency 94 * other CPUs.
 64 * on the tick. 95 * If a task triggers an exception in userspace, sleeps in the exception
 96 * handler and then migrates to another CPU, that new CPU must know where
97 * the exception returns by the time we call exception_exit().
98 * This information can only be provided by the previous CPU when it called
99 * exception_enter().
100 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
101 * is false because we know that CPU is not tickless.
65 */ 102 */
66 vtime_user_enter(current);
67 rcu_user_enter();
68 __this_cpu_write(context_tracking.state, IN_USER); 103 __this_cpu_write(context_tracking.state, IN_USER);
69 } 104 }
70 local_irq_restore(flags); 105 local_irq_restore(flags);
@@ -85,12 +120,11 @@ void user_enter(void)
85 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
86 * calling the scheduler. 121 * calling the scheduler.
87 */ 122 */
88void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
89{ 124{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
92 126
93 if (likely(ti->preempt_count || irqs_disabled())) 127 if (likely(!preemptible()))
94 return; 128 return;
95 129
96 /* 130 /*
@@ -112,8 +146,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */ 146#endif /* CONFIG_PREEMPT */
113 147
114/** 148/**
115 * user_exit - Inform the context tracking that the CPU is 149 * context_tracking_user_exit - Inform the context tracking that the CPU is
116 * exiting userspace mode and entering the kernel. 150 * exiting userspace mode and entering the kernel.
117 * 151 *
118 * This function must be called after we entered the kernel from userspace 152 * This function must be called after we entered the kernel from userspace
 119 * before any use of RCU read side critical section. This potentially includes 153 * before any use of RCU read side critical section. This potentially includes
@@ -122,47 +156,34 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
122 * This call supports re-entrancy. This way it can be called from any exception 156 * This call supports re-entrancy. This way it can be called from any exception
123 * handler without needing to know if we came from userspace or not. 157 * handler without needing to know if we came from userspace or not.
124 */ 158 */
125void user_exit(void) 159void context_tracking_user_exit(void)
126{ 160{
127 unsigned long flags; 161 unsigned long flags;
128 162
163 if (!static_key_false(&context_tracking_enabled))
164 return;
165
129 if (in_interrupt()) 166 if (in_interrupt())
130 return; 167 return;
131 168
132 local_irq_save(flags); 169 local_irq_save(flags);
133 if (__this_cpu_read(context_tracking.state) == IN_USER) { 170 if (__this_cpu_read(context_tracking.state) == IN_USER) {
134 /* 171 if (__this_cpu_read(context_tracking.active)) {
135 * We are going to run code that may use RCU. Inform 172 /*
136 * RCU core about that (ie: we may need the tick again). 173 * We are going to run code that may use RCU. Inform
137 */ 174 * RCU core about that (ie: we may need the tick again).
138 rcu_user_exit(); 175 */
139 vtime_user_exit(current); 176 rcu_user_exit();
177 vtime_user_exit(current);
178 trace_user_exit(0);
179 }
140 __this_cpu_write(context_tracking.state, IN_KERNEL); 180 __this_cpu_write(context_tracking.state, IN_KERNEL);
141 } 181 }
142 local_irq_restore(flags); 182 local_irq_restore(flags);
143} 183}
144 184
145void guest_enter(void)
146{
147 if (vtime_accounting_enabled())
148 vtime_guest_enter(current);
149 else
150 __guest_enter();
151}
152EXPORT_SYMBOL_GPL(guest_enter);
153
154void guest_exit(void)
155{
156 if (vtime_accounting_enabled())
157 vtime_guest_exit(current);
158 else
159 __guest_exit();
160}
161EXPORT_SYMBOL_GPL(guest_exit);
162
163
164/** 185/**
165 * context_tracking_task_switch - context switch the syscall callbacks 186 * __context_tracking_task_switch - context switch the syscall callbacks
166 * @prev: the task that is being switched out 187 * @prev: the task that is being switched out
167 * @next: the task that is being switched in 188 * @next: the task that is being switched in
168 * 189 *
@@ -174,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
174 * migrate to some CPU that doesn't do the context tracking. As such the TIF 195 * migrate to some CPU that doesn't do the context tracking. As such the TIF
175 * flag may not be desired there. 196 * flag may not be desired there.
176 */ 197 */
177void context_tracking_task_switch(struct task_struct *prev, 198void __context_tracking_task_switch(struct task_struct *prev,
178 struct task_struct *next) 199 struct task_struct *next)
179{ 200{
180 if (__this_cpu_read(context_tracking.active)) { 201 clear_tsk_thread_flag(prev, TIF_NOHZ);
181 clear_tsk_thread_flag(prev, TIF_NOHZ); 202 set_tsk_thread_flag(next, TIF_NOHZ);
182 set_tsk_thread_flag(next, TIF_NOHZ);
183 }
184} 203}
204
205#ifdef CONFIG_CONTEXT_TRACKING_FORCE
206void __init context_tracking_init(void)
207{
208 int cpu;
209
210 for_each_possible_cpu(cpu)
211 context_tracking_cpu_set(cpu);
212}
213#endif
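
Two things change in context_tracking.c above: a global enable (a static key in the kernel, so the disabled case costs only a patched-out branch) gates all per-CPU work, and the IN_USER/IN_KERNEL transitions are now recorded even on CPUs whose context_tracking.active is false, so a task that takes an exception and migrates still sees a consistent state. Below is a rough user-space model of that split; a plain bool stands in for the static key, an array for the per-CPU data, and the bodies just print what the kernel would do, so this is only an illustration.

#include <stdbool.h>
#include <stdio.h>

enum ctx_state { IN_KERNEL = 0, IN_USER };

struct context_tracking {
	bool active;		/* does this CPU need vtime/RCU hooks? */
	enum ctx_state state;	/* always maintained, even if !active */
};

#define NR_CPUS 4
static bool context_tracking_enabled;		/* static key stand-in */
static struct context_tracking ct[NR_CPUS];

static void context_tracking_cpu_set(int cpu)
{
	if (!ct[cpu].active) {
		ct[cpu].active = true;
		context_tracking_enabled = true;	/* static_key_slow_inc() */
	}
}

static void user_enter(int cpu)
{
	if (!context_tracking_enabled)		/* cheap check for everyone */
		return;
	if (ct[cpu].state != IN_USER) {
		if (ct[cpu].active)
			printf("cpu%d: vtime_user_enter + rcu_user_enter\n", cpu);
		/* state is tracked either way so a migrated task stays consistent */
		ct[cpu].state = IN_USER;
	}
}

static void user_exit(int cpu)
{
	if (!context_tracking_enabled)
		return;
	if (ct[cpu].state == IN_USER) {
		if (ct[cpu].active)
			printf("cpu%d: rcu_user_exit + vtime_user_exit\n", cpu);
		ct[cpu].state = IN_KERNEL;
	}
}

int main(void)
{
	context_tracking_cpu_set(1);	/* only cpu1 is "full dynticks" */

	user_enter(0);			/* transition recorded, no hooks run */
	user_enter(1);			/* transition recorded and hooks run */
	user_exit(0);
	user_exit(1);

	printf("cpu0 state=%d cpu1 state=%d\n", ct[0].state, ct[1].state);
	return 0;
}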
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b2b227b82123..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
113 * get_online_cpus() not an api which is called all that often. 113 * get_online_cpus() not an api which is called all that often.
114 * 114 *
115 */ 115 */
116static void cpu_hotplug_begin(void) 116void cpu_hotplug_begin(void)
117{ 117{
118 cpu_hotplug.active_writer = current; 118 cpu_hotplug.active_writer = current;
119 119
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
127 } 127 }
128} 128}
129 129
130static void cpu_hotplug_done(void) 130void cpu_hotplug_done(void)
131{ 131{
132 cpu_hotplug.active_writer = NULL; 132 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
154 cpu_maps_update_done(); 154 cpu_maps_update_done();
155} 155}
156 156
157#else /* #if CONFIG_HOTPLUG_CPU */ 157#endif /* CONFIG_HOTPLUG_CPU */
158static void cpu_hotplug_begin(void) {}
159static void cpu_hotplug_done(void) {}
160#endif /* #else #if CONFIG_HOTPLUG_CPU */
161 158
162/* Need to know about CPUs going up/down? */ 159/* Need to know about CPUs going up/down? */
163int __ref register_cpu_notifier(struct notifier_block *nb) 160int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -311,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
311 } 308 }
312 smpboot_park_threads(cpu); 309 smpboot_park_threads(cpu);
313 310
311 /*
312 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
313 * and RCU users of this state to go away such that all new such users
314 * will observe it.
315 *
316 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
317 * not imply sync_sched(), so explicitly call both.
318 */
319#ifdef CONFIG_PREEMPT
320 synchronize_sched();
321#endif
322 synchronize_rcu();
323
324 /*
325 * So now all preempt/rcu users must observe !cpu_active().
326 */
327
314 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 328 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
315 if (err) { 329 if (err) {
316 /* CPU didn't die: tell everyone. Can't complain. */ 330 /* CPU didn't die: tell everyone. Can't complain. */
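
The synchronize_sched()/synchronize_rcu() pair added to _cpu_down() above is a grace-period wait: once it returns, every preempt-disabled or RCU read-side section that might have sampled the old cpu_active_mask has finished, so the subsequent __stop_machine() only runs against readers that already observe the CPU as inactive. Userspace RCU exposes the same primitive; the sketch below shows the generic publish-then-wait-then-retire pattern rather than the hotplug code itself. It assumes liburcu is installed (build with something like cc sketch.c -lurcu -lpthread; newer releases may want urcu/urcu-memb.h and -lurcu-memb), and all struct and variable names are invented.

#include <urcu.h>		/* userspace RCU, default (memb) flavour */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct cpu_view {
	int active;
};

static struct cpu_view *view;

static void *reader(void *arg)
{
	rcu_register_thread();
	for (int i = 0; i < 100000; i++) {
		rcu_read_lock();
		struct cpu_view *v = rcu_dereference(view);
		if (v->active) {
			/* ... act on the "active" CPU ... */
		}
		rcu_read_unlock();
	}
	rcu_unregister_thread();
	return NULL;
}

int main(void)
{
	pthread_t tid;

	view = calloc(1, sizeof(*view));
	view->active = 1;

	pthread_create(&tid, NULL, reader, NULL);
	usleep(1000);

	/* Publish the "inactive" view. */
	struct cpu_view *newv = calloc(1, sizeof(*newv));
	newv->active = 0;
	struct cpu_view *old = view;
	rcu_assign_pointer(view, newv);

	/*
	 * Wait out every reader that might still hold the old view.  After
	 * this returns, all readers observe active == 0 -- the same kind of
	 * guarantee _cpu_down() wants about !cpu_active() before it calls
	 * __stop_machine().
	 */
	synchronize_rcu();
	free(old);

	pthread_join(tid, NULL);
	free(newv);
	return 0;
}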
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
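
The idle-loop changes revolve around TIF_POLLING_NRFLAG: a polling idle CPU notices TIF_NEED_RESCHED by itself, so the waker can skip the reschedule IPI, but the CPU must clear the polling bit and re-check the flag (current_clr_polling_and_test(), tif_need_resched()) before committing to a non-polling idle state, otherwise a request racing with the clear could be lost. The C11-atomics model below shows that handshake with two threads standing in for two CPUs; the flag word and helper names mirror the kernel ones, but everything else is invented.

/* build with: cc -pthread idle_poll.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TIF_POLLING_NRFLAG (1u << 0)
#define TIF_NEED_RESCHED   (1u << 1)

static atomic_uint thread_flags;
static atomic_int  ipis_sent;

/* Model of current_clr_polling_and_test(): clear POLLING, then re-check
 * NEED_RESCHED so a request that raced with the clear is not lost. */
static bool clr_polling_and_test(void)
{
	atomic_fetch_and(&thread_flags, ~TIF_POLLING_NRFLAG);
	return atomic_load(&thread_flags) & TIF_NEED_RESCHED;
}

static void *remote_wakeup(void *arg)
{
	/* Model of a resched request: only "send the IPI" if the target was
	 * not polling, because a polling CPU will notice the flag itself. */
	unsigned int old = atomic_fetch_or(&thread_flags, TIF_NEED_RESCHED);
	if (!(old & TIF_POLLING_NRFLAG))
		atomic_fetch_add(&ipis_sent, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Idle entry: advertise that we are polling on the flag word. */
	atomic_fetch_or(&thread_flags, TIF_POLLING_NRFLAG);

	pthread_create(&t, NULL, remote_wakeup, NULL);

	/* Poll until someone asks for a reschedule (cpu_idle_poll()). */
	while (!(atomic_load(&thread_flags) & TIF_NEED_RESCHED))
		;

	/* Before a non-polling idle state, clear POLLING and re-test,
	 * which is what the patch's helper does. */
	if (clr_polling_and_test())
		printf("need_resched set, skip deep idle and schedule\n");

	pthread_join(t, NULL);
	printf("IPIs sent: %d (0 expected, we were polling)\n",
	       atomic_load(&ipis_sent));
	return 0;
}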
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fedd..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
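
css_cs() and the one-line parent_cs() above work because struct cpuset embeds its cgroup_subsys_state, so container_of() plus a NULL check is the whole conversion. Below is a stand-alone illustration of that embedding idiom with toy structs and a simplified container_of that omits the kernel's type checking.

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of(): recover the enclosing
 * structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css {			/* stand-in for cgroup_subsys_state */
	struct css *parent;
};

struct cpuset {
	int id;			/* arbitrary payload */
	struct css css;		/* embedded, as in the kernel */
};

/* NULL-safe conversion, mirroring css_cs() in the hunk above. */
static struct cpuset *css_cs(struct css *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

static struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

int main(void)
{
	struct cpuset top = { .id = 1 };
	struct cpuset child = { .id = 2, .css.parent = &top.css };

	printf("child's parent id: %d\n", parent_cs(&child)->id);
	printf("top's parent: %p\n", (void *)parent_cs(&top));
	return 0;
}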
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
 229 * @root_cs: target cpuset to walk descendants of 218 * @root_cs: target cpuset to walk descendants of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -475,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
475 464
476 /* 465 /*
477 * Cpusets with tasks - existing or newly being attached - can't 466 * Cpusets with tasks - existing or newly being attached - can't
478 * have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
479 */ 468 */
480 ret = -ENOSPC; 469 ret = -ENOSPC;
481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
482 (cpumask_empty(trial->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
483 nodes_empty(trial->mems_allowed))) 472 cpumask_empty(trial->cpus_allowed))
484 goto out; 473 goto out;
474 if (!nodes_empty(cur->mems_allowed) &&
475 nodes_empty(trial->mems_allowed))
476 goto out;
477 }
485 478
486 ret = 0; 479 ret = 0;
487out: 480out:
@@ -511,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
511 struct cpuset *root_cs) 504 struct cpuset *root_cs)
512{ 505{
513 struct cpuset *cp; 506 struct cpuset *cp;
514 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
515 508
516 rcu_read_lock(); 509 rcu_read_lock();
517 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
518 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
519 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
520 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
521 continue; 517 continue;
522 } 518 }
523 519
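
The explicit if (cp == root_cs) continue; added here, and in the other pre-order walks in this file, exists because css_for_each_descendant_pre() now visits @root_cs itself as the first node, while subtree pruning still works by skipping past the rightmost descendant. The toy iterative pre-order walk below has both properties; an explicit stack replaces the css iterator, and the tree, names and has_cpus flag are invented.

#include <stdbool.h>
#include <stdio.h>

struct node {
	const char *name;
	bool has_cpus;		/* stand-in for !cpumask_empty() */
	struct node *child[4];
};

static void walk_pre(struct node *root, struct node *skip_node)
{
	struct node *stack[32];
	int top = 0;

	stack[top++] = root;
	while (top) {
		struct node *pos = stack[--top];

		if (pos != skip_node) {
			if (!pos->has_cpus) {
				/* css_rightmost_descendant(): prune the subtree */
				printf("%s: empty, pruning subtree\n", pos->name);
				continue;	/* children are not pushed */
			}
			printf("%s: visit\n", pos->name);
		}

		/* push children right-to-left so they pop in order */
		for (int i = 3; i >= 0; i--)
			if (pos->child[i])
				stack[top++] = pos->child[i];
	}
}

int main(void)
{
	struct node a1   = { "root/a/1", true };
	struct node a    = { "root/a", false, { &a1 } };
	struct node b    = { "root/b", true };
	struct node root = { "root", true, { &a, &b } };

	walk_pre(&root, &root);	/* skip the root, like the hunks above */
	return 0;
}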
@@ -592,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
592 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
593 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
594 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
595 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
596 592
597 doms = NULL; 593 doms = NULL;
598 dattr = NULL; 594 dattr = NULL;
@@ -621,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
621 csn = 0; 617 csn = 0;
622 618
623 rcu_read_lock(); 619 rcu_read_lock();
624 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
625 /* 623 /*
626 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
627 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -638,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
638 csa[csn++] = cp; 636 csa[csn++] = cp;
639 637
640 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
641 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
642 } 640 }
643 rcu_read_unlock(); 641 rcu_read_unlock();
644 642
@@ -833,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
833/** 831/**
834 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
835 * @tsk: task to test 833 * @tsk: task to test
 836 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset @tsk belongs to
837 * 835 *
838 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
839 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
840 * 838 *
841 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
842 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
843 */ 841 */
844static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
845 struct cgroup_scanner *scan)
846{ 843{
847 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
848 846
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
851} 848}
852 849
853/** 850/**
854 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
855 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
856 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
857 * 854 *
858 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
859 * 856 *
860 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
861 * calling callback functions for each. 858 * calling callback functions for each.
862 * 859 *
863 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
864 * if @heap != NULL. 861 * if @heap != NULL.
865 */ 862 */
866static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
867{ 864{
868 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
869
870 scan.cg = cs->css.cgroup;
871 scan.test_task = NULL;
872 scan.process_task = cpuset_change_cpumask;
873 scan.heap = heap;
874 cgroup_scan_tasks(&scan);
875} 866}
876 867
877/* 868/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
882 * 873 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
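
update_tasks_cpumask() above collapses to a single css_scan_tasks() call because the per-task callback now receives an opaque void *data instead of fishing the cpuset out of a cgroup_scanner. That is the ordinary C callback-plus-context idiom; a toy version for reference, in which the scanner, task array and names are all invented:

#include <stdio.h>

struct task {
	const char *comm;
	int cpus_allowed;	/* toy "cpumask" */
};

struct cpuset {
	int cpus_allowed;
};

/*
 * Generic scanner in the spirit of css_scan_tasks(css, test, process,
 * data, heap): call @test/@process for each task, handing @data through
 * untouched.
 */
static void scan_tasks(struct task *tasks, int nr,
		       int (*test)(struct task *, void *),
		       void (*process)(struct task *, void *),
		       void *data)
{
	for (int i = 0; i < nr; i++) {
		if (test && !test(&tasks[i], data))
			continue;
		process(&tasks[i], data);
	}
}

/* Per-task callback: recover the typed context from the void pointer. */
static void change_cpumask(struct task *tsk, void *data)
{
	struct cpuset *cs = data;

	tsk->cpus_allowed = cs->cpus_allowed;
	printf("%s -> cpus %#x\n", tsk->comm, tsk->cpus_allowed);
}

int main(void)
{
	struct task tasks[] = { { "init", 0xf }, { "kworker", 0xf } };
	struct cpuset cs = { .cpus_allowed = 0x3 };

	/* analogous to css_scan_tasks(&cs->css, NULL, change_cpumask, cs, heap) */
	scan_tasks(tasks, 2, NULL, change_cpumask, &cs);
	return 0;
}

When several values must travel through the same void pointer, the cpuset_change_nodemask() hunk further down shows the natural extension: bundle them in a small on-stack struct and pass its address as @data.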
@@ -889,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
890{ 881{
891 struct cpuset *cp; 882 struct cpuset *cp;
892 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896 884
897 rcu_read_lock(); 885 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 899 /* skip the whole subtree if @cp has some CPU */ 887 if (cp == root_cs) {
900 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
902 continue; 890 } else {
 891 /* skip the whole subtree if @cp has some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
903 } 896 }
904 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
905 continue; 898 continue;
@@ -1055,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1055 task_unlock(tsk); 1048 task_unlock(tsk);
1056} 1049}
1057 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1058/* 1056/*
1059 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1060 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1061 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1062 */ 1060 */
1063static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1064 struct cgroup_scanner *scan)
1065{ 1062{
1066 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1067 struct mm_struct *mm; 1065 struct mm_struct *mm;
1068 int migrate; 1066 int migrate;
1069 nodemask_t *newmems = scan->data;
1070 1067
1071 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1072 1069
1073 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1074 if (!mm) 1071 if (!mm)
@@ -1078,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1078 1075
1079 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1080 if (migrate) 1077 if (migrate)
1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1082 mmput(mm); 1079 mmput(mm);
1083} 1080}
1084 1081
@@ -1087,28 +1084,22 @@ static void *cpuset_being_rebound;
1087/** 1084/**
1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1091 * 1088 *
1092 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1094 * if @heap != NULL.
1095 */ 1091 */
1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1097{ 1093{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1101 1098
1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1103 1100
1104 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1105 1102
1106 scan.cg = cs->css.cgroup;
1107 scan.test_task = NULL;
1108 scan.process_task = cpuset_change_nodemask;
1109 scan.heap = heap;
1110 scan.data = &newmems;
1111
1112 /* 1103 /*
1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1114 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1119,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1119 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1120 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1121 */ 1112 */
1122 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1123 1114
1124 /* 1115 /*
1125 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1135,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1139 * 1130 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1146,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1147{ 1138{
1148 struct cpuset *cp; 1139 struct cpuset *cp;
1149 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153 1141
1154 rcu_read_lock(); 1142 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 1156 /* skip the whole subtree if @cp has some CPU */ 1144 if (cp == root_cs) {
1157 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1159 continue; 1147 } else {
 1148 /* skip the whole subtree if @cp has some mems */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1160 } 1153 }
1161 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1162 continue; 1155 continue;
@@ -1263,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1263 return 0; 1256 return 0;
1264} 1257}
1265 1258
1266/* 1259/**
1267 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1268 * @tsk: task to be updated 1261 * @tsk: task to be updated
 1269 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset @tsk belongs to
1270 * 1263 *
1271 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1272 * 1265 *
1273 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1274 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1275 */ 1268 */
1276static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1277 struct cgroup_scanner *scan)
1278{ 1270{
1279 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1280} 1274}
1281 1275
1282/* 1276/**
1283 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1284 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1285 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1286 * 1280 *
1287 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1288 * 1282 *
1289 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1290 * calling callback functions for each. 1284 * calling callback functions for each.
1291 * 1285 *
1292 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1293 * if @heap != NULL. 1287 * if @heap != NULL.
1294 */ 1288 */
1295static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1296{ 1290{
1297 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1298
1299 scan.cg = cs->css.cgroup;
1300 scan.test_task = NULL;
1301 scan.process_task = cpuset_change_flag;
1302 scan.heap = heap;
1303 cgroup_scan_tasks(&scan);
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1458,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1458} 1446}
1459 1447
1460/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1461static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1462{ 1451{
1463 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1464 struct task_struct *task; 1453 struct task_struct *task;
1465 int ret; 1454 int ret;
1466 1455
@@ -1471,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1471 * flag is set. 1460 * flag is set.
1472 */ 1461 */
1473 ret = -ENOSPC; 1462 ret = -ENOSPC;
1474 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1476 goto out_unlock; 1465 goto out_unlock;
1477 1466
1478 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1479 /* 1468 /*
1480 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1481 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1504,11 +1493,11 @@ out_unlock:
1504 return ret; 1493 return ret;
1505} 1494}
1506 1495
1507static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1508 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1509{ 1498{
1510 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1511 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1512 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1513} 1502}
1514 1503
@@ -1519,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1519 */ 1508 */
1520static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1521 1510
1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1523{ 1513{
1524 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1525 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1526 struct mm_struct *mm; 1516 struct mm_struct *mm;
1527 struct task_struct *task; 1517 struct task_struct *task;
1528 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1530 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1531 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1534 1525
@@ -1542,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1542 1533
1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1544 1535
1545 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1546 /* 1537 /*
1547 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1548 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1604,15 +1595,18 @@ typedef enum {
1604 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1605} cpuset_filetype_t; 1596} cpuset_filetype_t;
1606 1597
1607static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1608{ 1600{
1609 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1610 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1611 int retval = -ENODEV; 1603 int retval = 0;
1612 1604
1613 mutex_lock(&cpuset_mutex); 1605 mutex_lock(&cpuset_mutex);
1614 if (!is_cpuset_online(cs)) 1606 if (!is_cpuset_online(cs)) {
1607 retval = -ENODEV;
1615 goto out_unlock; 1608 goto out_unlock;
1609 }
1616 1610
1617 switch (type) { 1611 switch (type) {
1618 case FILE_CPU_EXCLUSIVE: 1612 case FILE_CPU_EXCLUSIVE:
@@ -1651,9 +1645,10 @@ out_unlock:
1651 return retval; 1645 return retval;
1652} 1646}
1653 1647
1654static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1655{ 1650{
1656 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1657 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1658 int retval = -ENODEV; 1653 int retval = -ENODEV;
1659 1654
@@ -1677,10 +1672,10 @@ out_unlock:
1677/* 1672/*
1678 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1679 */ 1674 */
1680static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1681 const char *buf) 1676 struct cftype *cft, const char *buf)
1682{ 1677{
1683 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1684 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1685 int retval = -ENODEV; 1680 int retval = -ENODEV;
1686 1681
@@ -1759,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1759 return count; 1754 return count;
1760} 1755}
1761 1756
1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1763 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1764 struct file *file, 1759 char __user *buf, size_t nbytes,
1765 char __user *buf, 1760 loff_t *ppos)
1766 size_t nbytes, loff_t *ppos)
1767{ 1761{
1768 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1769 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1770 char *page; 1764 char *page;
1771 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1795,9 +1789,9 @@ out:
1795 return retval; 1789 return retval;
1796} 1790}
1797 1791
1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1799{ 1793{
1800 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1801 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1802 switch (type) { 1796 switch (type) {
1803 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1826,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1826 return 0; 1820 return 0;
1827} 1821}
1828 1822
1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1830{ 1824{
1831 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1832 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1833 switch (type) { 1827 switch (type) {
1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1943,11 +1937,12 @@ static struct cftype files[] = {
1943 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1944 */ 1938 */
1945 1939
1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1947{ 1942{
1948 struct cpuset *cs; 1943 struct cpuset *cs;
1949 1944
1950 if (!cgrp->parent) 1945 if (!parent_css)
1951 return &top_cpuset.css; 1946 return &top_cpuset.css;
1952 1947
1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1967,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1967 return &cs->css; 1962 return &cs->css;
1968} 1963}
1969 1964
1970static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1971{ 1966{
1972 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1973 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1974 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1975 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1976 1971
1977 if (!parent) 1972 if (!parent)
1978 return 0; 1973 return 0;
@@ -1987,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1987 1982
1988 number_of_cpusets++; 1983 number_of_cpusets++;
1989 1984
1990 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1991 goto out_unlock; 1986 goto out_unlock;
1992 1987
1993 /* 1988 /*
@@ -2004,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2004 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2005 */ 2000 */
2006 rcu_read_lock(); 2001 rcu_read_lock();
2007 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2008 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2009 rcu_read_unlock(); 2004 rcu_read_unlock();
2010 goto out_unlock; 2005 goto out_unlock;
@@ -2021,9 +2016,15 @@ out_unlock:
2021 return 0; 2016 return 0;
2022} 2017}
2023 2018
2024static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2025{ 2026{
2026 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2027 2028
2028 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2029 2030
@@ -2036,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2036 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2037} 2038}
2038 2039
2039/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2040 * If the cpuset being removed has its flag 'sched_load_balance'
2041 * enabled, then simulate turning sched_load_balance off, which
2042 * will call rebuild_sched_domains_locked().
2043 */
2044
2045static void cpuset_css_free(struct cgroup *cgrp)
2046{ 2041{
2047 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2048 2043
2049 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2050 kfree(cs); 2045 kfree(cs);
@@ -2251,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2251 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2253 struct cpuset *cs; 2248 struct cpuset *cs;
2254 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2255 2250
2256 rcu_read_lock(); 2251 rcu_read_lock();
2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2258 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2259 continue; 2254 continue;
2260 rcu_read_unlock(); 2255 rcu_read_unlock();
2261 2256
@@ -2344,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2344 2339
2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2346{ 2341{
2347 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2348 2343
2349 rcu_read_lock(); 2344 rcu_read_lock();
2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2417,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2417 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2418 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2419 */ 2414 */
2420static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2421{ 2416{
2422 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2423 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2487,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2487 */ 2482 */
2488int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2489{ 2484{
2490 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2491 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2492 2487
2493 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2725,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2725 goto out_free; 2720 goto out_free;
2726 2721
2727 rcu_read_lock(); 2722 rcu_read_lock();
2728 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2729 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2730 rcu_read_unlock(); 2725 rcu_read_unlock();
2731 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
575 raw_spin_lock(&dbg_slave_lock); 575 raw_spin_lock(&dbg_slave_lock);
576 576
577#ifdef CONFIG_SMP 577#ifdef CONFIG_SMP
578 /* If send_ready set, slaves are already waiting */
579 if (ks->send_ready)
580 atomic_set(ks->send_ready, 1);
581
578 /* Signal the other CPUs to enter kgdb_wait() */ 582 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup) 583 else if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags); 584 kgdb_roundup_cpus(flags);
581#endif 585#endif
582 586
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678 if (arch_kgdb_ops.enable_nmi) 682 if (arch_kgdb_ops.enable_nmi)
679 arch_kgdb_ops.enable_nmi(0); 683 arch_kgdb_ops.enable_nmi(0);
680 684
685 memset(ks, 0, sizeof(struct kgdb_state));
681 ks->cpu = raw_smp_processor_id(); 686 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector; 687 ks->ex_vector = evector;
683 ks->signo = signo; 688 ks->signo = signo;
684 ks->err_code = ecode; 689 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs; 690 ks->linux_regs = regs;
687 691
688 if (kgdb_reenter_check(ks)) 692 if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
732 return 1; 736 return 1;
733} 737}
734 738
739int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
740{
741#ifdef CONFIG_SMP
742 if (!kgdb_io_ready(0) || !send_ready)
743 return 1;
744
745 if (kgdb_info[cpu].enter_kgdb == 0) {
746 struct kgdb_state kgdb_var;
747 struct kgdb_state *ks = &kgdb_var;
748
749 memset(ks, 0, sizeof(struct kgdb_state));
750 ks->cpu = cpu;
751 ks->ex_vector = trapnr;
752 ks->signo = SIGTRAP;
753 ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI;
754 ks->linux_regs = regs;
755 ks->send_ready = send_ready;
756 kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
757 return 0;
758 }
759#endif
760 return 1;
761}
762
735static void kgdb_console_write(struct console *co, const char *s, 763static void kgdb_console_write(struct console *co, const char *s,
736 unsigned count) 764 unsigned count)
737{ 765{
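
kgdb_nmicallin() above lets an architecture's system-NMI handler pull the current CPU into the debugger as master while the other CPUs, which took the same NMI, wait on a caller-supplied atomic instead of being rounded up by IPI. Below is a rough usage sketch; the calling convention shown is an assumption rather than a copy of an in-tree caller, and the handler name, master selection and trap number are hypothetical. Only kgdb_nmicallin() and kgdb_nmicallback() are real interfaces here.

        #include <linux/kgdb.h>
        #include <linux/atomic.h>

        static atomic_t kgdb_nmi_go = ATOMIC_INIT(0);

        static void example_system_nmi(int cpu, int is_master, struct pt_regs *regs)
        {
                if (is_master) {
                        /* Enter as master; with ->send_ready set, the debug
                         * core flips the atomic instead of doing its usual
                         * IPI round-up of the other CPUs. */
                        kgdb_nmicallin(cpu, 2 /* arch-specific NMI trap nr */,
                                       regs, &kgdb_nmi_go);
                } else {
                        /* The other CPUs also took the NMI: wait for the
                         * master's signal, then fold in as ordinary slaves. */
                        while (!atomic_read(&kgdb_nmi_go))
                                cpu_relax();
                        kgdb_nmicallback(cpu, regs);
                }
        }
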
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
26 unsigned long threadid; 26 unsigned long threadid;
27 long kgdb_usethreadid; 27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs; 28 struct pt_regs *linux_regs;
29 atomic_t *send_ready;
29}; 30};
30 31
31/* Exception state values */ 32/* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 75extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks); 76extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void); 77extern int kdb_common_deinit_state(void);
78#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
77#else /* ! CONFIG_KGDB_KDB */ 79#else /* ! CONFIG_KGDB_KDB */
78static inline int kdb_stub(struct kgdb_state *ks) 80static inline int kdb_stub(struct kgdb_state *ks)
79{ 81{
80 return DBG_PASS_EVENT; 82 return DBG_PASS_EVENT;
81} 83}
84#define KGDB_KDB_REASON_SYSTEM_NMI 0
82#endif /* CONFIG_KGDB_KDB */ 85#endif /* CONFIG_KGDB_KDB */
83 86
84#endif /* _DEBUG_CORE_H_ */ 87#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
69 if (atomic_read(&kgdb_setting_breakpoint)) 69 if (atomic_read(&kgdb_setting_breakpoint))
70 reason = KDB_REASON_KEYBOARD; 70 reason = KDB_REASON_KEYBOARD;
71 71
72 if (in_nmi()) 72 if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
73 reason = KDB_REASON_SYSTEM_NMI;
74
75 else if (in_nmi())
73 reason = KDB_REASON_NMI; 76 reason = KDB_REASON_NMI;
74 77
75 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 78 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1200 instruction_pointer(regs)); 1200 instruction_pointer(regs));
1201 kdb_dumpregs(regs); 1201 kdb_dumpregs(regs);
1202 break; 1202 break;
1203 case KDB_REASON_SYSTEM_NMI:
1204 kdb_printf("due to System NonMaskable Interrupt\n");
1205 break;
1203 case KDB_REASON_NMI: 1206 case KDB_REASON_NMI:
1204 kdb_printf("due to NonMaskable Interrupt @ " 1207 kdb_printf("due to NonMaskable Interrupt @ "
1205 kdb_machreg_fmt "\n", 1208 kdb_machreg_fmt "\n",
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8b..97b67df8fbfe 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118exit: 118exit:
119 if (err)
120 atomic_dec(&nr_callchain_events);
121
119 mutex_unlock(&callchain_mutex); 122 mutex_unlock(&callchain_mutex);
120 123
121 return err; 124 return err;
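
The callchain fix above closes a small accounting leak: get_callchain_buffers() bumps nr_callchain_events before it knows whether allocation will succeed, so the error path has to drop that reference again, otherwise the count stays inflated and the buffers are never torn down. The same shape in miniature, with a hypothetical allocation helper:

        static atomic_t nr_users = ATOMIC_INIT(0);

        static int allocate_buffers(void);      /* hypothetical helper */

        static int example_get_buffers(void)
        {
                int err = 0;

                if (atomic_inc_return(&nr_users) == 1)
                        err = allocate_buffers();

                if (err)
                        atomic_dec(&nr_users);  /* roll the count back */

                return err;
        }
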
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
145static atomic_t nr_mmap_events __read_mostly; 145static atomic_t nr_mmap_events __read_mostly;
146static atomic_t nr_comm_events __read_mostly; 146static atomic_t nr_comm_events __read_mostly;
147static atomic_t nr_task_events __read_mostly; 147static atomic_t nr_task_events __read_mostly;
148static atomic_t nr_freq_events __read_mostly;
148 149
149static LIST_HEAD(pmus); 150static LIST_HEAD(pmus);
150static DEFINE_MUTEX(pmus_lock); 151static DEFINE_MUTEX(pmus_lock);
@@ -174,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176 177
177static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
179 180
180void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
181{ 182{
@@ -183,7 +184,7 @@ void update_perf_cpu_limits(void)
183 184
184 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100); 186 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
187} 188}
188 189
189static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -192,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
192 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
193 loff_t *ppos) 194 loff_t *ppos)
194{ 195{
195 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
196 197
197 if (ret || !write) 198 if (ret || !write)
198 return ret; 199 return ret;
@@ -227,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
227 * we detect that events are taking too long. 228 * we detect that events are taking too long.
228 */ 229 */
229#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
231 232
232void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
233{ 234{
234 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
235 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
236 238
237 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
238 return; 240 return;
239 241
240 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -250,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
250 */ 252 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252 254
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
254 return; 256 return;
255 257
256 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -261,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262 264
263 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
269 270
270 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
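
The hunks above stop using an atomic_t for perf_sample_allowed_ns: the value is only written from the sysctl handlers and read locklessly on the sampling path, so a plain int plus one marked read is enough (newer kernels spell the marked access READ_ONCE()/WRITE_ONCE()). The pattern, reduced to its essentials:

        static int sample_allowed_ns __read_mostly;

        static void hot_path(u64 measured_ns)
        {
                /* take one stable snapshot, then work with the local copy */
                u64 allowed = ACCESS_ONCE(sample_allowed_ns);

                if (allowed == 0)
                        return;
                if (measured_ns > allowed)
                        ;       /* warn and throttle, as perf_sample_event_took() does */
        }
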
@@ -340,8 +341,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 341static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 342perf_cgroup_from_task(struct task_struct *task)
342{ 343{
343 return container_of(task_subsys_state(task, perf_subsys_id), 344 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 345 struct perf_cgroup, css);
345} 346}
346 347
347static inline bool 348static inline bool
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 592 if (!f.file)
592 return -EBADF; 593 return -EBADF;
593 594
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 595 rcu_read_lock();
596
597 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 598 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 599 ret = PTR_ERR(css);
597 goto out; 600 goto out;
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 620 ret = -EINVAL;
618 } 621 }
619out: 622out:
623 rcu_read_unlock();
620 fdput(f); 624 fdput(f);
621 return ret; 625 return ret;
622} 626}
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
869 873
870 WARN_ON(!irqs_disabled()); 874 WARN_ON(!irqs_disabled());
871 875
872 if (list_empty(&cpuctx->rotation_list)) { 876 if (list_empty(&cpuctx->rotation_list))
873 int was_empty = list_empty(head);
874 list_add(&cpuctx->rotation_list, head); 877 list_add(&cpuctx->rotation_list, head);
875 if (was_empty)
876 tick_nohz_full_kick();
877 }
878} 878}
879 879
880static void get_ctx(struct perf_event_context *ctx) 880static void get_ctx(struct perf_event_context *ctx)
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1216,6 +1222,9 @@ static void perf_event__id_header_size(struct perf_event *event)
1216 if (sample_type & PERF_SAMPLE_TIME) 1222 if (sample_type & PERF_SAMPLE_TIME)
1217 size += sizeof(data->time); 1223 size += sizeof(data->time);
1218 1224
1225 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1226 size += sizeof(data->id);
1227
1219 if (sample_type & PERF_SAMPLE_ID) 1228 if (sample_type & PERF_SAMPLE_ID)
1220 size += sizeof(data->id); 1229 size += sizeof(data->id);
1221 1230
@@ -1307,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1307 */ 1316 */
1308 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1309 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1310} 1321}
1311 1322
1312static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -2143,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2143} 2154}
2144 2155
2145/* 2156/*
2146 * Test whether two contexts are equivalent, i.e. whether they 2157 * Test whether two contexts are equivalent, i.e. whether they have both been
2147 * have both been cloned from the same version of the same context 2158 * cloned from the same version of the same context.
2148 * and they both have the same number of enabled events. 2159 *
2149 * If the number of enabled events is the same, then the set 2160 * Equivalence is measured using a generation number in the context that is
2150 * of enabled events should be the same, because these are both 2161 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2151 * inherited contexts, therefore we can't access individual events 2162 * and list_del_event().
2152 * in them directly with an fd; we can only enable/disable all
2153 * events via prctl, or enable/disable all events in a family
2154 * via ioctl, which will have the same effect on both contexts.
2155 */ 2163 */
2156static int context_equiv(struct perf_event_context *ctx1, 2164static int context_equiv(struct perf_event_context *ctx1,
2157 struct perf_event_context *ctx2) 2165 struct perf_event_context *ctx2)
2158{ 2166{
2159 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2167 /* Pinning disables the swap optimization */
2160 && ctx1->parent_gen == ctx2->parent_gen 2168 if (ctx1->pin_count || ctx2->pin_count)
2161 && !ctx1->pin_count && !ctx2->pin_count; 2169 return 0;
2170
2171 /* If ctx1 is the parent of ctx2 */
2172 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2173 return 1;
2174
2175 /* If ctx2 is the parent of ctx1 */
2176 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2177 return 1;
2178
2179 /*
2180 * If ctx1 and ctx2 have the same parent; we flatten the parent
2181 * hierarchy, see perf_event_init_context().
2182 */
2183 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2184 ctx1->parent_gen == ctx2->parent_gen)
2185 return 1;
2186
2187 /* Unmatched */
2188 return 0;
2162} 2189}
2163 2190
2164static void __perf_event_sync_stat(struct perf_event *event, 2191static void __perf_event_sync_stat(struct perf_event *event,
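
context_equiv() now relies on a generation counter rather than on counting enabled events: every operation that changes a context's event list (see list_add_event(), list_del_event() and unclone_ctx() above) bumps ->generation, a clone records the parent's value in ->parent_gen, and two contexts are considered interchangeable only while those numbers still match. A toy version of the scheme, with hypothetical names:

        #include <stdint.h>

        struct ctx {
                struct ctx *parent;     /* NULL if not a clone            */
                uint64_t generation;    /* bumped on every modification   */
                uint64_t parent_gen;    /* parent->generation at clone    */
        };

        static int ctx_equiv(struct ctx *a, struct ctx *b)
        {
                /* a is b's parent and has not changed since the clone */
                if (a == b->parent && a->generation == b->parent_gen)
                        return 1;
                /* b is a's parent and has not changed since the clone */
                if (a->parent == b && a->parent_gen == b->generation)
                        return 1;
                /* both were cloned from the same parent generation */
                return a->parent && a->parent == b->parent &&
                       a->parent_gen == b->parent_gen;
        }
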
@@ -2241,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2241{ 2268{
2242 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2269 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2243 struct perf_event_context *next_ctx; 2270 struct perf_event_context *next_ctx;
2244 struct perf_event_context *parent; 2271 struct perf_event_context *parent, *next_parent;
2245 struct perf_cpu_context *cpuctx; 2272 struct perf_cpu_context *cpuctx;
2246 int do_switch = 1; 2273 int do_switch = 1;
2247 2274
@@ -2253,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2253 return; 2280 return;
2254 2281
2255 rcu_read_lock(); 2282 rcu_read_lock();
2256 parent = rcu_dereference(ctx->parent_ctx);
2257 next_ctx = next->perf_event_ctxp[ctxn]; 2283 next_ctx = next->perf_event_ctxp[ctxn];
2258 if (parent && next_ctx && 2284 if (!next_ctx)
2259 rcu_dereference(next_ctx->parent_ctx) == parent) { 2285 goto unlock;
2286
2287 parent = rcu_dereference(ctx->parent_ctx);
2288 next_parent = rcu_dereference(next_ctx->parent_ctx);
2289
2290 /* If neither context have a parent context; they cannot be clones. */
2291 if (!parent && !next_parent)
2292 goto unlock;
2293
2294 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2260 /* 2295 /*
2261 * Looks like the two contexts are clones, so we might be 2296 * Looks like the two contexts are clones, so we might be
2262 * able to optimize the context switch. We lock both 2297 * able to optimize the context switch. We lock both
@@ -2284,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2284 raw_spin_unlock(&next_ctx->lock); 2319 raw_spin_unlock(&next_ctx->lock);
2285 raw_spin_unlock(&ctx->lock); 2320 raw_spin_unlock(&ctx->lock);
2286 } 2321 }
2322unlock:
2287 rcu_read_unlock(); 2323 rcu_read_unlock();
2288 2324
2289 if (do_switch) { 2325 if (do_switch) {
@@ -2712,7 +2748,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2712 2748
2713 hwc = &event->hw; 2749 hwc = &event->hw;
2714 2750
2715 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2751 if (hwc->interrupts == MAX_INTERRUPTS) {
2716 hwc->interrupts = 0; 2752 hwc->interrupts = 0;
2717 perf_log_throttle(event, 1); 2753 perf_log_throttle(event, 1);
2718 event->pmu->start(event, 0); 2754 event->pmu->start(event, 0);
@@ -2811,10 +2847,11 @@ done:
2811#ifdef CONFIG_NO_HZ_FULL 2847#ifdef CONFIG_NO_HZ_FULL
2812bool perf_event_can_stop_tick(void) 2848bool perf_event_can_stop_tick(void)
2813{ 2849{
2814 if (list_empty(&__get_cpu_var(rotation_list))) 2850 if (atomic_read(&nr_freq_events) ||
2815 return true; 2851 __this_cpu_read(perf_throttled_count))
2816 else
2817 return false; 2852 return false;
2853 else
2854 return true;
2818} 2855}
2819#endif 2856#endif
2820 2857
@@ -3128,36 +3165,63 @@ static void free_event_rcu(struct rcu_head *head)
3128static void ring_buffer_put(struct ring_buffer *rb); 3165static void ring_buffer_put(struct ring_buffer *rb);
3129static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3166static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3130 3167
3131static void free_event(struct perf_event *event) 3168static void unaccount_event_cpu(struct perf_event *event, int cpu)
3132{ 3169{
3133 irq_work_sync(&event->pending); 3170 if (event->parent)
3171 return;
3172
3173 if (has_branch_stack(event)) {
3174 if (!(event->attach_state & PERF_ATTACH_TASK))
3175 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3176 }
3177 if (is_cgroup_event(event))
3178 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3179}
3180
3181static void unaccount_event(struct perf_event *event)
3182{
3183 if (event->parent)
3184 return;
3134 3185
3186 if (event->attach_state & PERF_ATTACH_TASK)
3187 static_key_slow_dec_deferred(&perf_sched_events);
3188 if (event->attr.mmap || event->attr.mmap_data)
3189 atomic_dec(&nr_mmap_events);
3190 if (event->attr.comm)
3191 atomic_dec(&nr_comm_events);
3192 if (event->attr.task)
3193 atomic_dec(&nr_task_events);
3194 if (event->attr.freq)
3195 atomic_dec(&nr_freq_events);
3196 if (is_cgroup_event(event))
3197 static_key_slow_dec_deferred(&perf_sched_events);
3198 if (has_branch_stack(event))
3199 static_key_slow_dec_deferred(&perf_sched_events);
3200
3201 unaccount_event_cpu(event, event->cpu);
3202}
3203
3204static void __free_event(struct perf_event *event)
3205{
3135 if (!event->parent) { 3206 if (!event->parent) {
3136 if (event->attach_state & PERF_ATTACH_TASK)
3137 static_key_slow_dec_deferred(&perf_sched_events);
3138 if (event->attr.mmap || event->attr.mmap_data)
3139 atomic_dec(&nr_mmap_events);
3140 if (event->attr.comm)
3141 atomic_dec(&nr_comm_events);
3142 if (event->attr.task)
3143 atomic_dec(&nr_task_events);
3144 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3207 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3145 put_callchain_buffers(); 3208 put_callchain_buffers();
3146 if (is_cgroup_event(event)) {
3147 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3148 static_key_slow_dec_deferred(&perf_sched_events);
3149 }
3150
3151 if (has_branch_stack(event)) {
3152 static_key_slow_dec_deferred(&perf_sched_events);
3153 /* is system-wide event */
3154 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3155 atomic_dec(&per_cpu(perf_branch_stack_events,
3156 event->cpu));
3157 }
3158 }
3159 } 3209 }
3160 3210
3211 if (event->destroy)
3212 event->destroy(event);
3213
3214 if (event->ctx)
3215 put_ctx(event->ctx);
3216
3217 call_rcu(&event->rcu_head, free_event_rcu);
3218}
3219static void free_event(struct perf_event *event)
3220{
3221 irq_work_sync(&event->pending);
3222
3223 unaccount_event(event);
3224
3161 if (event->rb) { 3225 if (event->rb) {
3162 struct ring_buffer *rb; 3226 struct ring_buffer *rb;
3163 3227
@@ -3180,13 +3244,8 @@ static void free_event(struct perf_event *event)
3180 if (is_cgroup_event(event)) 3244 if (is_cgroup_event(event))
3181 perf_detach_cgroup(event); 3245 perf_detach_cgroup(event);
3182 3246
3183 if (event->destroy)
3184 event->destroy(event);
3185
3186 if (event->ctx)
3187 put_ctx(event->ctx);
3188 3247
3189 call_rcu(&event->rcu_head, free_event_rcu); 3248 __free_event(event);
3190} 3249}
3191 3250
3192int perf_event_release_kernel(struct perf_event *event) 3251int perf_event_release_kernel(struct perf_event *event)
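
The restructuring above splits teardown into symmetric halves: unaccount_event()/unaccount_event_cpu() undo exactly what account_event()/account_event_cpu() (added further down, next to perf_event_alloc()) did, while __free_event() only releases memory and references. Keeping the increment and decrement sides as mirror-image helpers makes it harder for one case (cgroup events, branch-stack events, freq events, and so on) to be counted on one side and forgotten on the other. Condensed from the hunks above and below, the pairing looks like this:

        static void account_event(struct perf_event *event)
        {
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.freq)
                        atomic_inc(&nr_freq_events);
                /* ... every counter bumped here ... */
        }

        static void unaccount_event(struct perf_event *event)
        {
                if (event->attr.comm)
                        atomic_dec(&nr_comm_events);
                if (event->attr.freq)
                        atomic_dec(&nr_freq_events);
                /* ... is dropped here, and nowhere else ... */
        }
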
@@ -3544,6 +3603,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3544 case PERF_EVENT_IOC_PERIOD: 3603 case PERF_EVENT_IOC_PERIOD:
3545 return perf_event_period(event, (u64 __user *)arg); 3604 return perf_event_period(event, (u64 __user *)arg);
3546 3605
3606 case PERF_EVENT_IOC_ID:
3607 {
3608 u64 id = primary_event_id(event);
3609
3610 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3611 return -EFAULT;
3612 return 0;
3613 }
3614
3547 case PERF_EVENT_IOC_SET_OUTPUT: 3615 case PERF_EVENT_IOC_SET_OUTPUT:
3548 { 3616 {
3549 int ret; 3617 int ret;
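
PERF_EVENT_IOC_ID gives userspace the same primary event ID that later shows up in the sample stream, which is what makes PERF_SAMPLE_IDENTIFIER (further down) useful for demultiplexing. A minimal, lightly error-checked userspace sketch; it needs uapi headers from this kernel series or newer:

        #include <linux/perf_event.h>
        #include <sys/ioctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                struct perf_event_attr attr = {
                        .type           = PERF_TYPE_SOFTWARE,
                        .size           = sizeof(attr),
                        .config         = PERF_COUNT_SW_CPU_CLOCK,
                        .sample_type    = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP,
                        .exclude_kernel = 1,
                };
                uint64_t id;
                int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

                if (fd < 0 || ioctl(fd, PERF_EVENT_IOC_ID, &id) < 0)
                        return 1;
                /* 'id' now matches the identifier carried in this event's records */
                printf("event id: %llu\n", (unsigned long long)id);
                close(fd);
                return 0;
        }
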
@@ -3625,6 +3693,26 @@ static void calc_timer_values(struct perf_event *event,
3625 *running = ctx_time - event->tstamp_running; 3693 *running = ctx_time - event->tstamp_running;
3626} 3694}
3627 3695
3696static void perf_event_init_userpage(struct perf_event *event)
3697{
3698 struct perf_event_mmap_page *userpg;
3699 struct ring_buffer *rb;
3700
3701 rcu_read_lock();
3702 rb = rcu_dereference(event->rb);
3703 if (!rb)
3704 goto unlock;
3705
3706 userpg = rb->user_page;
3707
3708 /* Allow new userspace to detect that bit 0 is deprecated */
3709 userpg->cap_bit0_is_deprecated = 1;
3710 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
3711
3712unlock:
3713 rcu_read_unlock();
3714}
3715
3628void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 3716void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3629{ 3717{
3630} 3718}
@@ -3641,6 +3729,10 @@ void perf_event_update_userpage(struct perf_event *event)
3641 u64 enabled, running, now; 3729 u64 enabled, running, now;
3642 3730
3643 rcu_read_lock(); 3731 rcu_read_lock();
3732 rb = rcu_dereference(event->rb);
3733 if (!rb)
3734 goto unlock;
3735
3644 /* 3736 /*
3645 * compute total_time_enabled, total_time_running 3737 * compute total_time_enabled, total_time_running
3646 * based on snapshot values taken when the event 3738 * based on snapshot values taken when the event
@@ -3651,12 +3743,8 @@ void perf_event_update_userpage(struct perf_event *event)
3651 * NMI context 3743 * NMI context
3652 */ 3744 */
3653 calc_timer_values(event, &now, &enabled, &running); 3745 calc_timer_values(event, &now, &enabled, &running);
3654 rb = rcu_dereference(event->rb);
3655 if (!rb)
3656 goto unlock;
3657 3746
3658 userpg = rb->user_page; 3747 userpg = rb->user_page;
3659
3660 /* 3748 /*
3661 * Disable preemption so as to not let the corresponding user-space 3749 * Disable preemption so as to not let the corresponding user-space
3662 * spin too long if we get preempted. 3750 * spin too long if we get preempted.
@@ -4009,6 +4097,7 @@ again:
4009 ring_buffer_attach(event, rb); 4097 ring_buffer_attach(event, rb);
4010 rcu_assign_pointer(event->rb, rb); 4098 rcu_assign_pointer(event->rb, rb);
4011 4099
4100 perf_event_init_userpage(event);
4012 perf_event_update_userpage(event); 4101 perf_event_update_userpage(event);
4013 4102
4014unlock: 4103unlock:
@@ -4251,7 +4340,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4251 if (sample_type & PERF_SAMPLE_TIME) 4340 if (sample_type & PERF_SAMPLE_TIME)
4252 data->time = perf_clock(); 4341 data->time = perf_clock();
4253 4342
4254 if (sample_type & PERF_SAMPLE_ID) 4343 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4255 data->id = primary_event_id(event); 4344 data->id = primary_event_id(event);
4256 4345
4257 if (sample_type & PERF_SAMPLE_STREAM_ID) 4346 if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4290,6 +4379,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4290 4379
4291 if (sample_type & PERF_SAMPLE_CPU) 4380 if (sample_type & PERF_SAMPLE_CPU)
4292 perf_output_put(handle, data->cpu_entry); 4381 perf_output_put(handle, data->cpu_entry);
4382
4383 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4384 perf_output_put(handle, data->id);
4293} 4385}
4294 4386
4295void perf_event__output_id_sample(struct perf_event *event, 4387void perf_event__output_id_sample(struct perf_event *event,
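
__perf_event__output_id_sample() writes the identifier as the last word of non-sample records, while perf_output_sample()/perf_prepare_sample() put it first in PERF_RECORD_SAMPLE; together that gives every record a fixed slot for the event ID, so a reader can attribute records without decoding the whole, event-specific sample layout. A userspace-side sketch of that lookup, assuming every event on the buffer was opened with PERF_SAMPLE_IDENTIFIER and sample_id_all:

        #include <linux/perf_event.h>
        #include <stddef.h>
        #include <stdint.h>

        static uint64_t record_id(const struct perf_event_header *hdr)
        {
                const uint64_t *body = (const uint64_t *)(hdr + 1);
                size_t words = (hdr->size - sizeof(*hdr)) / sizeof(uint64_t);

                if (hdr->type == PERF_RECORD_SAMPLE)
                        return body[0];         /* identifier leads the sample     */

                return body[words - 1];         /* ... and trails everything else  */
        }
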
@@ -4355,7 +4447,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4355 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4447 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4356 n = 0; 4448 n = 0;
4357 4449
4358 if (sub != event) 4450 if ((sub != event) &&
4451 (sub->state == PERF_EVENT_STATE_ACTIVE))
4359 sub->pmu->read(sub); 4452 sub->pmu->read(sub);
4360 4453
4361 values[n++] = perf_event_count(sub); 4454 values[n++] = perf_event_count(sub);
@@ -4402,6 +4495,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4402 4495
4403 perf_output_put(handle, *header); 4496 perf_output_put(handle, *header);
4404 4497
4498 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4499 perf_output_put(handle, data->id);
4500
4405 if (sample_type & PERF_SAMPLE_IP) 4501 if (sample_type & PERF_SAMPLE_IP)
4406 perf_output_put(handle, data->ip); 4502 perf_output_put(handle, data->ip);
4407 4503
@@ -4462,20 +4558,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4462 } 4558 }
4463 } 4559 }
4464 4560
4465 if (!event->attr.watermark) {
4466 int wakeup_events = event->attr.wakeup_events;
4467
4468 if (wakeup_events) {
4469 struct ring_buffer *rb = handle->rb;
4470 int events = local_inc_return(&rb->events);
4471
4472 if (events >= wakeup_events) {
4473 local_sub(wakeup_events, &rb->events);
4474 local_inc(&rb->wakeup);
4475 }
4476 }
4477 }
4478
4479 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4561 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4480 if (data->br_stack) { 4562 if (data->br_stack) {
4481 size_t size; 4563 size_t size;
@@ -4511,16 +4593,34 @@ void perf_output_sample(struct perf_output_handle *handle,
4511 } 4593 }
4512 } 4594 }
4513 4595
4514 if (sample_type & PERF_SAMPLE_STACK_USER) 4596 if (sample_type & PERF_SAMPLE_STACK_USER) {
4515 perf_output_sample_ustack(handle, 4597 perf_output_sample_ustack(handle,
4516 data->stack_user_size, 4598 data->stack_user_size,
4517 data->regs_user.regs); 4599 data->regs_user.regs);
4600 }
4518 4601
4519 if (sample_type & PERF_SAMPLE_WEIGHT) 4602 if (sample_type & PERF_SAMPLE_WEIGHT)
4520 perf_output_put(handle, data->weight); 4603 perf_output_put(handle, data->weight);
4521 4604
4522 if (sample_type & PERF_SAMPLE_DATA_SRC) 4605 if (sample_type & PERF_SAMPLE_DATA_SRC)
4523 perf_output_put(handle, data->data_src.val); 4606 perf_output_put(handle, data->data_src.val);
4607
4608 if (sample_type & PERF_SAMPLE_TRANSACTION)
4609 perf_output_put(handle, data->txn);
4610
4611 if (!event->attr.watermark) {
4612 int wakeup_events = event->attr.wakeup_events;
4613
4614 if (wakeup_events) {
4615 struct ring_buffer *rb = handle->rb;
4616 int events = local_inc_return(&rb->events);
4617
4618 if (events >= wakeup_events) {
4619 local_sub(wakeup_events, &rb->events);
4620 local_inc(&rb->wakeup);
4621 }
4622 }
4623 }
4524} 4624}
4525 4625
4526void perf_prepare_sample(struct perf_event_header *header, 4626void perf_prepare_sample(struct perf_event_header *header,
@@ -4680,12 +4780,10 @@ perf_event_read_event(struct perf_event *event,
4680 perf_output_end(&handle); 4780 perf_output_end(&handle);
4681} 4781}
4682 4782
4683typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4684typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4783typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4685 4784
4686static void 4785static void
4687perf_event_aux_ctx(struct perf_event_context *ctx, 4786perf_event_aux_ctx(struct perf_event_context *ctx,
4688 perf_event_aux_match_cb match,
4689 perf_event_aux_output_cb output, 4787 perf_event_aux_output_cb output,
4690 void *data) 4788 void *data)
4691{ 4789{
@@ -4696,15 +4794,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
4696 continue; 4794 continue;
4697 if (!event_filter_match(event)) 4795 if (!event_filter_match(event))
4698 continue; 4796 continue;
4699 if (match(event, data)) 4797 output(event, data);
4700 output(event, data);
4701 } 4798 }
4702} 4799}
4703 4800
4704static void 4801static void
4705perf_event_aux(perf_event_aux_match_cb match, 4802perf_event_aux(perf_event_aux_output_cb output, void *data,
4706 perf_event_aux_output_cb output,
4707 void *data,
4708 struct perf_event_context *task_ctx) 4803 struct perf_event_context *task_ctx)
4709{ 4804{
4710 struct perf_cpu_context *cpuctx; 4805 struct perf_cpu_context *cpuctx;
@@ -4717,7 +4812,7 @@ perf_event_aux(perf_event_aux_match_cb match,
4717 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4812 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4718 if (cpuctx->unique_pmu != pmu) 4813 if (cpuctx->unique_pmu != pmu)
4719 goto next; 4814 goto next;
4720 perf_event_aux_ctx(&cpuctx->ctx, match, output, data); 4815 perf_event_aux_ctx(&cpuctx->ctx, output, data);
4721 if (task_ctx) 4816 if (task_ctx)
4722 goto next; 4817 goto next;
4723 ctxn = pmu->task_ctx_nr; 4818 ctxn = pmu->task_ctx_nr;
@@ -4725,14 +4820,14 @@ perf_event_aux(perf_event_aux_match_cb match,
4725 goto next; 4820 goto next;
4726 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4821 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4727 if (ctx) 4822 if (ctx)
4728 perf_event_aux_ctx(ctx, match, output, data); 4823 perf_event_aux_ctx(ctx, output, data);
4729next: 4824next:
4730 put_cpu_ptr(pmu->pmu_cpu_context); 4825 put_cpu_ptr(pmu->pmu_cpu_context);
4731 } 4826 }
4732 4827
4733 if (task_ctx) { 4828 if (task_ctx) {
4734 preempt_disable(); 4829 preempt_disable();
4735 perf_event_aux_ctx(task_ctx, match, output, data); 4830 perf_event_aux_ctx(task_ctx, output, data);
4736 preempt_enable(); 4831 preempt_enable();
4737 } 4832 }
4738 rcu_read_unlock(); 4833 rcu_read_unlock();
@@ -4741,7 +4836,7 @@ next:
4741/* 4836/*
4742 * task tracking -- fork/exit 4837 * task tracking -- fork/exit
4743 * 4838 *
4744 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 4839 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4745 */ 4840 */
4746 4841
4747struct perf_task_event { 4842struct perf_task_event {
@@ -4759,6 +4854,13 @@ struct perf_task_event {
4759 } event_id; 4854 } event_id;
4760}; 4855};
4761 4856
4857static int perf_event_task_match(struct perf_event *event)
4858{
4859 return event->attr.comm || event->attr.mmap ||
4860 event->attr.mmap2 || event->attr.mmap_data ||
4861 event->attr.task;
4862}
4863
4762static void perf_event_task_output(struct perf_event *event, 4864static void perf_event_task_output(struct perf_event *event,
4763 void *data) 4865 void *data)
4764{ 4866{
@@ -4768,6 +4870,9 @@ static void perf_event_task_output(struct perf_event *event,
4768 struct task_struct *task = task_event->task; 4870 struct task_struct *task = task_event->task;
4769 int ret, size = task_event->event_id.header.size; 4871 int ret, size = task_event->event_id.header.size;
4770 4872
4873 if (!perf_event_task_match(event))
4874 return;
4875
4771 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4876 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4772 4877
4773 ret = perf_output_begin(&handle, event, 4878 ret = perf_output_begin(&handle, event,
@@ -4790,13 +4895,6 @@ out:
4790 task_event->event_id.header.size = size; 4895 task_event->event_id.header.size = size;
4791} 4896}
4792 4897
4793static int perf_event_task_match(struct perf_event *event,
4794 void *data __maybe_unused)
4795{
4796 return event->attr.comm || event->attr.mmap ||
4797 event->attr.mmap_data || event->attr.task;
4798}
4799
4800static void perf_event_task(struct task_struct *task, 4898static void perf_event_task(struct task_struct *task,
4801 struct perf_event_context *task_ctx, 4899 struct perf_event_context *task_ctx,
4802 int new) 4900 int new)
@@ -4825,8 +4923,7 @@ static void perf_event_task(struct task_struct *task,
4825 }, 4923 },
4826 }; 4924 };
4827 4925
4828 perf_event_aux(perf_event_task_match, 4926 perf_event_aux(perf_event_task_output,
4829 perf_event_task_output,
4830 &task_event, 4927 &task_event,
4831 task_ctx); 4928 task_ctx);
4832} 4929}
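
From here on the side-band record types (task, comm, mmap) all follow the same simplification: the per-type match() callback is folded into the start of the corresponding _output() routine, so perf_event_aux() and perf_event_aux_ctx() only carry a single function pointer. Schematically, with stand-in types:

        struct evt { int wants_comm; };

        typedef void (*aux_output_fn)(struct evt *e, void *data);

        static void comm_output(struct evt *e, void *data)
        {
                if (!e->wants_comm)     /* the old match() step, now inlined */
                        return;
                /* ... build and emit the record for this event ... */
        }

        static void for_each_event(struct evt *events, int n,
                                   aux_output_fn output, void *data)
        {
                for (int i = 0; i < n; i++)
                        output(&events[i], data);
        }
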
@@ -4853,6 +4950,11 @@ struct perf_comm_event {
4853 } event_id; 4950 } event_id;
4854}; 4951};
4855 4952
4953static int perf_event_comm_match(struct perf_event *event)
4954{
4955 return event->attr.comm;
4956}
4957
4856static void perf_event_comm_output(struct perf_event *event, 4958static void perf_event_comm_output(struct perf_event *event,
4857 void *data) 4959 void *data)
4858{ 4960{
@@ -4862,6 +4964,9 @@ static void perf_event_comm_output(struct perf_event *event,
4862 int size = comm_event->event_id.header.size; 4964 int size = comm_event->event_id.header.size;
4863 int ret; 4965 int ret;
4864 4966
4967 if (!perf_event_comm_match(event))
4968 return;
4969
4865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4970 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4866 ret = perf_output_begin(&handle, event, 4971 ret = perf_output_begin(&handle, event,
4867 comm_event->event_id.header.size); 4972 comm_event->event_id.header.size);
@@ -4883,12 +4988,6 @@ out:
4883 comm_event->event_id.header.size = size; 4988 comm_event->event_id.header.size = size;
4884} 4989}
4885 4990
4886static int perf_event_comm_match(struct perf_event *event,
4887 void *data __maybe_unused)
4888{
4889 return event->attr.comm;
4890}
4891
4892static void perf_event_comm_event(struct perf_comm_event *comm_event) 4991static void perf_event_comm_event(struct perf_comm_event *comm_event)
4893{ 4992{
4894 char comm[TASK_COMM_LEN]; 4993 char comm[TASK_COMM_LEN];
@@ -4903,8 +5002,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4903 5002
4904 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 5003 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4905 5004
4906 perf_event_aux(perf_event_comm_match, 5005 perf_event_aux(perf_event_comm_output,
4907 perf_event_comm_output,
4908 comm_event, 5006 comm_event,
4909 NULL); 5007 NULL);
4910} 5008}
@@ -4955,6 +5053,9 @@ struct perf_mmap_event {
4955 5053
4956 const char *file_name; 5054 const char *file_name;
4957 int file_size; 5055 int file_size;
5056 int maj, min;
5057 u64 ino;
5058 u64 ino_generation;
4958 5059
4959 struct { 5060 struct {
4960 struct perf_event_header header; 5061 struct perf_event_header header;
@@ -4967,6 +5068,17 @@ struct perf_mmap_event {
4967 } event_id; 5068 } event_id;
4968}; 5069};
4969 5070
5071static int perf_event_mmap_match(struct perf_event *event,
5072 void *data)
5073{
5074 struct perf_mmap_event *mmap_event = data;
5075 struct vm_area_struct *vma = mmap_event->vma;
5076 int executable = vma->vm_flags & VM_EXEC;
5077
5078 return (!executable && event->attr.mmap_data) ||
5079 (executable && (event->attr.mmap || event->attr.mmap2));
5080}
5081
4970static void perf_event_mmap_output(struct perf_event *event, 5082static void perf_event_mmap_output(struct perf_event *event,
4971 void *data) 5083 void *data)
4972{ 5084{
@@ -4976,6 +5088,17 @@ static void perf_event_mmap_output(struct perf_event *event,
4976 int size = mmap_event->event_id.header.size; 5088 int size = mmap_event->event_id.header.size;
4977 int ret; 5089 int ret;
4978 5090
5091 if (!perf_event_mmap_match(event, data))
5092 return;
5093
5094 if (event->attr.mmap2) {
5095 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5096 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5097 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5098 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5099 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5100 }
5101
4979 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5102 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4980 ret = perf_output_begin(&handle, event, 5103 ret = perf_output_begin(&handle, event,
4981 mmap_event->event_id.header.size); 5104 mmap_event->event_id.header.size);
@@ -4986,6 +5109,14 @@ static void perf_event_mmap_output(struct perf_event *event,
4986 mmap_event->event_id.tid = perf_event_tid(event, current); 5109 mmap_event->event_id.tid = perf_event_tid(event, current);
4987 5110
4988 perf_output_put(&handle, mmap_event->event_id); 5111 perf_output_put(&handle, mmap_event->event_id);
5112
5113 if (event->attr.mmap2) {
5114 perf_output_put(&handle, mmap_event->maj);
5115 perf_output_put(&handle, mmap_event->min);
5116 perf_output_put(&handle, mmap_event->ino);
5117 perf_output_put(&handle, mmap_event->ino_generation);
5118 }
5119
4989 __output_copy(&handle, mmap_event->file_name, 5120 __output_copy(&handle, mmap_event->file_name,
4990 mmap_event->file_size); 5121 mmap_event->file_size);
4991 5122
@@ -4996,82 +5127,89 @@ out:
4996 mmap_event->event_id.header.size = size; 5127 mmap_event->event_id.header.size = size;
4997} 5128}
4998 5129
4999static int perf_event_mmap_match(struct perf_event *event,
5000 void *data)
5001{
5002 struct perf_mmap_event *mmap_event = data;
5003 struct vm_area_struct *vma = mmap_event->vma;
5004 int executable = vma->vm_flags & VM_EXEC;
5005
5006 return (!executable && event->attr.mmap_data) ||
5007 (executable && event->attr.mmap);
5008}
5009
5010static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5130static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5011{ 5131{
5012 struct vm_area_struct *vma = mmap_event->vma; 5132 struct vm_area_struct *vma = mmap_event->vma;
5013 struct file *file = vma->vm_file; 5133 struct file *file = vma->vm_file;
5134 int maj = 0, min = 0;
5135 u64 ino = 0, gen = 0;
5014 unsigned int size; 5136 unsigned int size;
5015 char tmp[16]; 5137 char tmp[16];
5016 char *buf = NULL; 5138 char *buf = NULL;
5017 const char *name; 5139 char *name;
5018
5019 memset(tmp, 0, sizeof(tmp));
5020 5140
5021 if (file) { 5141 if (file) {
5142 struct inode *inode;
5143 dev_t dev;
5144
5145 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5146 if (!buf) {
5147 name = "//enomem";
5148 goto cpy_name;
5149 }
5022 /* 5150 /*
5023 * d_path works from the end of the rb backwards, so we 5151 * d_path() works from the end of the rb backwards, so we
5024 * need to add enough zero bytes after the string to handle 5152 * need to add enough zero bytes after the string to handle
5025 * the 64bit alignment we do later. 5153 * the 64bit alignment we do later.
5026 */ 5154 */
5027 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5155 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5028 if (!buf) {
5029 name = strncpy(tmp, "//enomem", sizeof(tmp));
5030 goto got_name;
5031 }
5032 name = d_path(&file->f_path, buf, PATH_MAX);
5033 if (IS_ERR(name)) { 5156 if (IS_ERR(name)) {
5034 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5157 name = "//toolong";
5035 goto got_name; 5158 goto cpy_name;
5036 } 5159 }
5160 inode = file_inode(vma->vm_file);
5161 dev = inode->i_sb->s_dev;
5162 ino = inode->i_ino;
5163 gen = inode->i_generation;
5164 maj = MAJOR(dev);
5165 min = MINOR(dev);
5166 goto got_name;
5037 } else { 5167 } else {
5038 if (arch_vma_name(mmap_event->vma)) { 5168 name = (char *)arch_vma_name(vma);
5039 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5169 if (name)
5040 sizeof(tmp) - 1); 5170 goto cpy_name;
5041 tmp[sizeof(tmp) - 1] = '\0';
5042 goto got_name;
5043 }
5044 5171
5045 if (!vma->vm_mm) { 5172 if (vma->vm_start <= vma->vm_mm->start_brk &&
5046 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5047 goto got_name;
5048 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5049 vma->vm_end >= vma->vm_mm->brk) { 5173 vma->vm_end >= vma->vm_mm->brk) {
5050 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5174 name = "[heap]";
5051 goto got_name; 5175 goto cpy_name;
5052 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5176 }
5177 if (vma->vm_start <= vma->vm_mm->start_stack &&
5053 vma->vm_end >= vma->vm_mm->start_stack) { 5178 vma->vm_end >= vma->vm_mm->start_stack) {
5054 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5179 name = "[stack]";
5055 goto got_name; 5180 goto cpy_name;
5056 } 5181 }
5057 5182
5058 name = strncpy(tmp, "//anon", sizeof(tmp)); 5183 name = "//anon";
5059 goto got_name; 5184 goto cpy_name;
5060 } 5185 }
5061 5186
5187cpy_name:
5188 strlcpy(tmp, name, sizeof(tmp));
5189 name = tmp;
5062got_name: 5190got_name:
5063 size = ALIGN(strlen(name)+1, sizeof(u64)); 5191 /*
5192 * Since our buffer works in 8 byte units we need to align our string
5193 * size to a multiple of 8. However, we must guarantee the tail end is
5194 * zero'd out to avoid leaking random bits to userspace.
5195 */
5196 size = strlen(name)+1;
5197 while (!IS_ALIGNED(size, sizeof(u64)))
5198 name[size++] = '\0';
5064 5199
5065 mmap_event->file_name = name; 5200 mmap_event->file_name = name;
5066 mmap_event->file_size = size; 5201 mmap_event->file_size = size;
5202 mmap_event->maj = maj;
5203 mmap_event->min = min;
5204 mmap_event->ino = ino;
5205 mmap_event->ino_generation = gen;
5067 5206
5068 if (!(vma->vm_flags & VM_EXEC)) 5207 if (!(vma->vm_flags & VM_EXEC))
5069 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5208 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5070 5209
5071 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5210 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5072 5211
5073 perf_event_aux(perf_event_mmap_match, 5212 perf_event_aux(perf_event_mmap_output,
5074 perf_event_mmap_output,
5075 mmap_event, 5213 mmap_event,
5076 NULL); 5214 NULL);
5077 5215
@@ -5101,6 +5239,10 @@ void perf_event_mmap(struct vm_area_struct *vma)
5101 .len = vma->vm_end - vma->vm_start, 5239 .len = vma->vm_end - vma->vm_start,
5102 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5240 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5103 }, 5241 },
5242 /* .maj (attr_mmap2 only) */
5243 /* .min (attr_mmap2 only) */
5244 /* .ino (attr_mmap2 only) */
5245 /* .ino_generation (attr_mmap2 only) */
5104 }; 5246 };
5105 5247
5106 perf_event_mmap_event(&mmap_event); 5248 perf_event_mmap_event(&mmap_event);
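
For ring-buffer readers, the mmap2 additions mean a new PERF_RECORD_MMAP2 that extends PERF_RECORD_MMAP with device and inode identity, enough to tell two different files mapped at the same path apart. The payload below is a sketch inferred from perf_event_mmap_output() above; the authoritative definition lives in include/uapi/linux/perf_event.h, and note that perf_copy_attr() still rejects attr.mmap2 in this series ("disabled for now").

        #include <stdint.h>

        struct mmap2_record {
                /* struct perf_event_header header;  header.type == PERF_RECORD_MMAP2 */
                uint32_t pid, tid;
                uint64_t addr, len, pgoff;
                uint32_t maj, min;              /* backing device               */
                uint64_t ino, ino_generation;   /* inode identity               */
                char     filename[];            /* NUL-padded to a u64 boundary */
                /* optional sample_id trailer follows the filename */
        };
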
@@ -5178,6 +5320,7 @@ static int __perf_event_overflow(struct perf_event *event,
5178 __this_cpu_inc(perf_throttled_count); 5320 __this_cpu_inc(perf_throttled_count);
5179 hwc->interrupts = MAX_INTERRUPTS; 5321 hwc->interrupts = MAX_INTERRUPTS;
5180 perf_log_throttle(event, 0); 5322 perf_log_throttle(event, 0);
5323 tick_nohz_full_kick();
5181 ret = 1; 5324 ret = 1;
5182 } 5325 }
5183 } 5326 }
@@ -6189,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6189 6332
6190 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6333 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6191} 6334}
6335static DEVICE_ATTR_RO(type);
6192 6336
6193static ssize_t 6337static ssize_t
6194perf_event_mux_interval_ms_show(struct device *dev, 6338perf_event_mux_interval_ms_show(struct device *dev,
@@ -6233,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6233 6377
6234 return count; 6378 return count;
6235} 6379}
6380static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6236 6381
6237static struct device_attribute pmu_dev_attrs[] = { 6382static struct attribute *pmu_dev_attrs[] = {
6238 __ATTR_RO(type), 6383 &dev_attr_type.attr,
6239 __ATTR_RW(perf_event_mux_interval_ms), 6384 &dev_attr_perf_event_mux_interval_ms.attr,
6240 __ATTR_NULL, 6385 NULL,
6241}; 6386};
6387ATTRIBUTE_GROUPS(pmu_dev);
6242 6388
6243static int pmu_bus_running; 6389static int pmu_bus_running;
6244static struct bus_type pmu_bus = { 6390static struct bus_type pmu_bus = {
6245 .name = "event_source", 6391 .name = "event_source",
6246 .dev_attrs = pmu_dev_attrs, 6392 .dev_groups = pmu_dev_groups,
6247}; 6393};
6248 6394
6249static void pmu_dev_release(struct device *dev) 6395static void pmu_dev_release(struct device *dev)
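
The sysfs hunk is a straight conversion to the current driver-core idiom: DEVICE_ATTR_RO()/DEVICE_ATTR_RW() generate dev_attr_<name> from the existing <name>_show/<name>_store functions, ATTRIBUTE_GROUPS() wraps the attribute array into <prefix>_groups, and the bus registers .dev_groups instead of the deprecated .dev_attrs. The same pattern on a made-up bus:

        #include <linux/device.h>

        static ssize_t mode_show(struct device *dev,
                                 struct device_attribute *attr, char *page)
        {
                return sprintf(page, "example\n");
        }
        static DEVICE_ATTR_RO(mode);            /* emits dev_attr_mode   */

        static struct attribute *foo_dev_attrs[] = {
                &dev_attr_mode.attr,
                NULL,
        };
        ATTRIBUTE_GROUPS(foo_dev);              /* emits foo_dev_groups  */

        static struct bus_type foo_bus = {
                .name       = "foo",
                .dev_groups = foo_dev_groups,
        };
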
@@ -6443,6 +6589,44 @@ unlock:
6443 return pmu; 6589 return pmu;
6444} 6590}
6445 6591
6592static void account_event_cpu(struct perf_event *event, int cpu)
6593{
6594 if (event->parent)
6595 return;
6596
6597 if (has_branch_stack(event)) {
6598 if (!(event->attach_state & PERF_ATTACH_TASK))
6599 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6600 }
6601 if (is_cgroup_event(event))
6602 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6603}
6604
6605static void account_event(struct perf_event *event)
6606{
6607 if (event->parent)
6608 return;
6609
6610 if (event->attach_state & PERF_ATTACH_TASK)
6611 static_key_slow_inc(&perf_sched_events.key);
6612 if (event->attr.mmap || event->attr.mmap_data)
6613 atomic_inc(&nr_mmap_events);
6614 if (event->attr.comm)
6615 atomic_inc(&nr_comm_events);
6616 if (event->attr.task)
6617 atomic_inc(&nr_task_events);
6618 if (event->attr.freq) {
6619 if (atomic_inc_return(&nr_freq_events) == 1)
6620 tick_nohz_full_kick_all();
6621 }
6622 if (has_branch_stack(event))
6623 static_key_slow_inc(&perf_sched_events.key);
6624 if (is_cgroup_event(event))
6625 static_key_slow_inc(&perf_sched_events.key);
6626
6627 account_event_cpu(event, event->cpu);
6628}
6629
6446/* 6630/*
6447 * Allocate and initialize a event structure 6631 * Allocate and initialize a event structure
6448 */ 6632 */
@@ -6457,7 +6641,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6457 struct pmu *pmu; 6641 struct pmu *pmu;
6458 struct perf_event *event; 6642 struct perf_event *event;
6459 struct hw_perf_event *hwc; 6643 struct hw_perf_event *hwc;
6460 long err; 6644 long err = -EINVAL;
6461 6645
6462 if ((unsigned)cpu >= nr_cpu_ids) { 6646 if ((unsigned)cpu >= nr_cpu_ids) {
6463 if (!task || cpu != -1) 6647 if (!task || cpu != -1)
@@ -6540,49 +6724,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6540 * we currently do not support PERF_FORMAT_GROUP on inherited events 6724 * we currently do not support PERF_FORMAT_GROUP on inherited events
6541 */ 6725 */
6542 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6726 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6543 goto done; 6727 goto err_ns;
6544 6728
6545 pmu = perf_init_event(event); 6729 pmu = perf_init_event(event);
6546
6547done:
6548 err = 0;
6549 if (!pmu) 6730 if (!pmu)
6550 err = -EINVAL; 6731 goto err_ns;
6551 else if (IS_ERR(pmu)) 6732 else if (IS_ERR(pmu)) {
6552 err = PTR_ERR(pmu); 6733 err = PTR_ERR(pmu);
6553 6734 goto err_ns;
6554 if (err) {
6555 if (event->ns)
6556 put_pid_ns(event->ns);
6557 kfree(event);
6558 return ERR_PTR(err);
6559 } 6735 }
6560 6736
6561 if (!event->parent) { 6737 if (!event->parent) {
6562 if (event->attach_state & PERF_ATTACH_TASK)
6563 static_key_slow_inc(&perf_sched_events.key);
6564 if (event->attr.mmap || event->attr.mmap_data)
6565 atomic_inc(&nr_mmap_events);
6566 if (event->attr.comm)
6567 atomic_inc(&nr_comm_events);
6568 if (event->attr.task)
6569 atomic_inc(&nr_task_events);
6570 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6738 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6571 err = get_callchain_buffers(); 6739 err = get_callchain_buffers();
6572 if (err) { 6740 if (err)
6573 free_event(event); 6741 goto err_pmu;
6574 return ERR_PTR(err);
6575 }
6576 }
6577 if (has_branch_stack(event)) {
6578 static_key_slow_inc(&perf_sched_events.key);
6579 if (!(event->attach_state & PERF_ATTACH_TASK))
6580 atomic_inc(&per_cpu(perf_branch_stack_events,
6581 event->cpu));
6582 } 6742 }
6583 } 6743 }
6584 6744
6585 return event; 6745 return event;
6746
6747err_pmu:
6748 if (event->destroy)
6749 event->destroy(event);
6750err_ns:
6751 if (event->ns)
6752 put_pid_ns(event->ns);
6753 kfree(event);
6754
6755 return ERR_PTR(err);
6586} 6756}
6587 6757
6588static int perf_copy_attr(struct perf_event_attr __user *uattr, 6758static int perf_copy_attr(struct perf_event_attr __user *uattr,
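
perf_event_alloc() now follows the usual kernel unwinding style: initialise err pessimistically, jump to the label that frees exactly what has been set up so far, and fall through the labels in reverse order of acquisition, instead of open-coding cleanup at each failure site. The idiom in isolation, with hypothetical helpers and a hypothetical struct:

        #include <linux/slab.h>
        #include <linux/err.h>

        struct thing { int dummy; };

        static long attach_pmu(struct thing *t);        /* hypothetical step 1 */
        static long setup_buffers(struct thing *t);     /* hypothetical step 2 */
        static void detach_pmu(struct thing *t);        /* undoes step 1       */

        static struct thing *thing_alloc(void)
        {
                struct thing *t;
                long err = -EINVAL;

                t = kzalloc(sizeof(*t), GFP_KERNEL);
                if (!t)
                        return ERR_PTR(-ENOMEM);

                err = attach_pmu(t);
                if (err)
                        goto err_free;

                err = setup_buffers(t);
                if (err)
                        goto err_pmu;

                return t;

        err_pmu:
                detach_pmu(t);
        err_free:
                kfree(t);
                return ERR_PTR(err);
        }
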
@@ -6640,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6640 if (ret) 6810 if (ret)
6641 return -EFAULT; 6811 return -EFAULT;
6642 6812
6813 /* disabled for now */
6814 if (attr->mmap2)
6815 return -EINVAL;
6816
6643 if (attr->__reserved_1) 6817 if (attr->__reserved_1)
6644 return -EINVAL; 6818 return -EINVAL;
6645 6819
@@ -6864,17 +7038,14 @@ SYSCALL_DEFINE5(perf_event_open,
6864 7038
6865 if (flags & PERF_FLAG_PID_CGROUP) { 7039 if (flags & PERF_FLAG_PID_CGROUP) {
6866 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7040 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6867 if (err) 7041 if (err) {
6868 goto err_alloc; 7042 __free_event(event);
6869 /* 7043 goto err_task;
6870 * one more event: 7044 }
6871 * - that has cgroup constraint on event->cpu
6872 * - that may need work on context switch
6873 */
6874 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6875 static_key_slow_inc(&perf_sched_events.key);
6876 } 7045 }
6877 7046
7047 account_event(event);
7048
6878 /* 7049 /*
6879 * Special case software events and allow them to be part of 7050 * Special case software events and allow them to be part of
6880 * any hardware group. 7051 * any hardware group.
@@ -6998,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
6998 } 7169 }
6999 7170
7000 perf_install_in_context(ctx, event, event->cpu); 7171 perf_install_in_context(ctx, event, event->cpu);
7001 ++ctx->generation;
7002 perf_unpin_context(ctx); 7172 perf_unpin_context(ctx);
7003 mutex_unlock(&ctx->mutex); 7173 mutex_unlock(&ctx->mutex);
7004 7174
@@ -7070,6 +7240,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7070 goto err; 7240 goto err;
7071 } 7241 }
7072 7242
7243 account_event(event);
7244
7073 ctx = find_get_context(event->pmu, task, cpu); 7245 ctx = find_get_context(event->pmu, task, cpu);
7074 if (IS_ERR(ctx)) { 7246 if (IS_ERR(ctx)) {
7075 err = PTR_ERR(ctx); 7247 err = PTR_ERR(ctx);
@@ -7079,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7079 WARN_ON_ONCE(ctx->parent_ctx); 7251 WARN_ON_ONCE(ctx->parent_ctx);
7080 mutex_lock(&ctx->mutex); 7252 mutex_lock(&ctx->mutex);
7081 perf_install_in_context(ctx, event, cpu); 7253 perf_install_in_context(ctx, event, cpu);
7082 ++ctx->generation;
7083 perf_unpin_context(ctx); 7254 perf_unpin_context(ctx);
7084 mutex_unlock(&ctx->mutex); 7255 mutex_unlock(&ctx->mutex);
7085 7256
@@ -7106,18 +7277,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7106 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7277 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7107 event_entry) { 7278 event_entry) {
7108 perf_remove_from_context(event); 7279 perf_remove_from_context(event);
7280 unaccount_event_cpu(event, src_cpu);
7109 put_ctx(src_ctx); 7281 put_ctx(src_ctx);
7110 list_add(&event->event_entry, &events); 7282 list_add(&event->migrate_entry, &events);
7111 } 7283 }
7112 mutex_unlock(&src_ctx->mutex); 7284 mutex_unlock(&src_ctx->mutex);
7113 7285
7114 synchronize_rcu(); 7286 synchronize_rcu();
7115 7287
7116 mutex_lock(&dst_ctx->mutex); 7288 mutex_lock(&dst_ctx->mutex);
7117 list_for_each_entry_safe(event, tmp, &events, event_entry) { 7289 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7118 list_del(&event->event_entry); 7290 list_del(&event->migrate_entry);
7119 if (event->state >= PERF_EVENT_STATE_OFF) 7291 if (event->state >= PERF_EVENT_STATE_OFF)
7120 event->state = PERF_EVENT_STATE_INACTIVE; 7292 event->state = PERF_EVENT_STATE_INACTIVE;
7293 account_event_cpu(event, dst_cpu);
7121 perf_install_in_context(dst_ctx, event, dst_cpu); 7294 perf_install_in_context(dst_ctx, event, dst_cpu);
7122 get_ctx(dst_ctx); 7295 get_ctx(dst_ctx);
7123 } 7296 }
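
The migration hunk adds a dedicated migrate_entry node for the "park events on a temporary list while moving them between contexts" step. Reusing event_entry for that scratch list is fragile, because the same list_head is also manipulated (and RCU-traversed) by the context code during removal and re-installation; a separate node sidesteps the overlap. The general rule, reduced to a sketch: an object that must sit on two lists at once needs two list_heads.

        #include <linux/list.h>

        struct item {
                struct list_head entry;          /* long-lived list membership */
                struct list_head migrate_entry;  /* scratch list while moving  */
        };

        static void park_all(struct list_head *src, struct list_head *scratch)
        {
                struct item *it, *tmp;

                /* unlink from the primary list, queue on the scratch list */
                list_for_each_entry_safe(it, tmp, src, entry) {
                        list_del(&it->entry);
                        list_add(&it->migrate_entry, scratch);
                }
        }
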
@@ -7798,7 +7971,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7971device_initcall(perf_event_sysfs_init);
7799 7972
7800#ifdef CONFIG_CGROUP_PERF 7973#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7974static struct cgroup_subsys_state *
7975perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7976{
7803 struct perf_cgroup *jc; 7977 struct perf_cgroup *jc;
7804 7978
@@ -7815,11 +7989,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7989 return &jc->css;
7816} 7990}
7817 7991
7818static void perf_cgroup_css_free(struct cgroup *cont) 7992static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7993{
7820 struct perf_cgroup *jc; 7994 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7995
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7996 free_percpu(jc->info);
7824 kfree(jc); 7997 kfree(jc);
7825} 7998}
@@ -7831,15 +8004,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 8004 return 0;
7832} 8005}
7833 8006
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 8007static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8008 struct cgroup_taskset *tset)
7835{ 8009{
7836 struct task_struct *task; 8010 struct task_struct *task;
7837 8011
7838 cgroup_taskset_for_each(task, cgrp, tset) 8012 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 8013 task_function_call(task, __perf_cgroup_move, task);
7840} 8014}
7841 8015
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 8016static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8017 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 8018 struct task_struct *task)
7844{ 8019{
7845 /* 8020 /*
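
The cgroup glue at the end of the file tracks the cgroup core's interface change in this cycle: subsystem callbacks now receive a struct cgroup_subsys_state directly (and cgroup_taskset_for_each() takes a css), so the per-subsystem state is one container_of() away instead of being looked up through the cgroup. The shape of such a callback, for a made-up subsystem:

        #include <linux/cgroup.h>
        #include <linux/slab.h>

        struct my_cgroup {
                struct cgroup_subsys_state css;
                /* ... per-cgroup state ... */
        };

        static inline struct my_cgroup *css_to_my(struct cgroup_subsys_state *css)
        {
                return container_of(css, struct my_cgroup, css);
        }

        static void my_css_free(struct cgroup_subsys_state *css)
        {
                kfree(css_to_my(css));
        }
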
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
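
All three copy helpers used by DEFINE_OUTPUT_COPY() now follow the __copy_from_user_inatomic() convention of returning the number of bytes *not* copied, which is why the macro body gains "written = size - written" and why the user-space variant can simply disable page faults around the copy. A plain-C rendering of a loop with that convention (hypothetical names, no ring-buffer wrap handling):

        #include <string.h>

        /* returns bytes left, i.e. 0 on full success */
        static unsigned long copy_plain(void *dst, const void *src, unsigned long n)
        {
                memcpy(dst, src, n);
                return 0;
        }

        static unsigned long output_copy(char *dst, unsigned long space,
                                         const char *src, unsigned long len)
        {
                while (len && space) {
                        unsigned long size = space < len ? space : len;
                        unsigned long left = copy_plain(dst, src, size);
                        unsigned long written = size - left;    /* as in the macro */

                        len   -= written;
                        src   += written;
                        dst   += written;
                        space -= written;

                        if (left)               /* partial copy, e.g. a fault */
                                break;
                }
                return len;                     /* bytes that never made it */
        }
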
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,36 @@ again:
87 goto out; 57 goto out;
88 58
89 /* 59 /*
90 * Publish the known good head. Rely on the full barrier implied 60 * Since the mmap() consumer (userspace) can run on a different CPU:
91 * by atomic_dec_and_test() order the rb->head read and this 61 *
92 * write. 62 * kernel user
63 *
64 * READ ->data_tail READ ->data_head
65 * smp_mb() (A) smp_rmb() (C)
66 * WRITE $data READ $data
67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail
69 *
70 * Where A pairs with D, and B pairs with C.
71 *
72 * I don't think A needs to be a full barrier because we won't in fact
73 * write data until we see the store from userspace. So we simply don't
74 * issue the data WRITE until we observe it. Be conservative for now.
75 *
76 * OTOH, D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE.
78 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C
80 * an RMB is sufficient since it separates two READs.
81 *
82 * See perf_output_begin().
93 */ 83 */
84 smp_wmb();
94 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
95 86
96 /* 87 /*
97 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
98 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
99 */ 90 */
100 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
101 local_inc(&rb->nest); 92 local_inc(&rb->nest);
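
The comment block above documents the kernel-side pairing; the userspace half (barriers C and D) looks roughly like the sketch below. struct ring_page and ring_read() are illustrative stand-ins for the real perf userpage and a tools/perf-style consumer, and GCC/C11 fences approximate smp_rmb()/smp_mb(); this is a sketch of the protocol, not a drop-in reader.

#include <stdint.h>
#include <stddef.h>

struct ring_page {			/* illustrative userpage layout */
	volatile uint64_t data_head;	/* written by the kernel        */
	volatile uint64_t data_tail;	/* written by userspace         */
};

/* Consume up to buf_len payload bytes from a power-of-two data area. */
static size_t ring_read(struct ring_page *pg, const char *data,
			size_t data_size, char *buf, size_t buf_len)
{
	uint64_t head = pg->data_head;		 /* READ ->data_head        */
	__atomic_thread_fence(__ATOMIC_ACQUIRE); /* (C) rmb: head, then data */

	uint64_t tail = pg->data_tail;
	size_t avail = (size_t)(head - tail);
	size_t n = avail < buf_len ? avail : buf_len;

	for (size_t i = 0; i < n; i++)		 /* READ $data (wrapping)   */
		buf[i] = data[(tail + i) & (data_size - 1)];

	__atomic_thread_fence(__ATOMIC_SEQ_CST); /* (D) mb: data, then tail */
	pg->data_tail = tail + n;		 /* WRITE ->data_tail       */
	return n;
}
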
@@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
114{ 105{
115 struct ring_buffer *rb; 106 struct ring_buffer *rb;
116 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
117 int have_lost; 108 int have_lost, page_shift;
118 struct perf_sample_data sample_data;
119 struct { 109 struct {
120 struct perf_event_header header; 110 struct perf_event_header header;
121 u64 id; 111 u64 id;
@@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
130 event = event->parent; 120 event = event->parent;
131 121
132 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
133 if (!rb) 123 if (unlikely(!rb))
134 goto out; 124 goto out;
135 125
136 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
137 handle->event = event;
138
139 if (!rb->nr_pages)
140 goto out; 127 goto out;
141 128
129 handle->rb = rb;
130 handle->event = event;
131
142 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
143 if (have_lost) { 133 if (unlikely(have_lost)) {
144 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
145 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
146 event); 136 size += event->id_header_size;
147 size += lost_event.header.size;
148 } 137 }
149 138
150 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
151 140
152 do { 141 do {
153 /*
154 * Userspace could choose to issue a mb() before updating the
155 * tail pointer. So that all reads will be completed before the
156 * write is issued.
157 */
158 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
159 smp_rmb();
160 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
161 head += size; 144 if (!rb->overwrite &&
162 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
163 goto fail; 146 goto fail;
147 head += size;
164 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
165 149
166 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
167 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
168 161
169 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
170 handle->page &= rb->nr_pages - 1;
171 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
172 handle->addr = rb->data_pages[handle->page];
173 handle->addr += handle->size;
174 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
175 163
176 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
177 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
178 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
179 lost_event.id = event->id; 175 lost_event.id = event->id;
180 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
181 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
182 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
183 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
184 } 182 }
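
perf_output_begin() now rejects an oversized record with CIRC_SPACE() instead of the removed perf_output_space(). For a power-of-two buffer with free-running head/tail counters the check reduces to the sketch below; reserve() is an illustrative helper, while the CIRC_SPACE() body matches include/linux/circ_buf.h.

#define CIRC_SPACE(head, tail, size) (((tail) - ((head) + 1)) & ((size) - 1))

/*
 * Try to reserve len bytes in a size-byte (power of two) ring.
 * Returns 0 and advances *head, or -1 when the record would overwrite
 * data the consumer has not read yet (the !overwrite case above).
 */
static int reserve(unsigned long *head, unsigned long tail,
		   unsigned long size, unsigned long len)
{
	if (CIRC_SPACE(*head, tail, size) < len)
		return -1;
	*head += len;	/* head stays free-running; it is masked on use */
	return 0;
}
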
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d629..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
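
The rewritten copy_insn() replaces the two-case page-boundary handling with one loop that clamps each chunk to the end of the current page and returns -EIO only if nothing could be read at all. A userspace sketch of that loop follows; read_chunk() stands in for __copy_insn(), PAGE_SZ for PAGE_SIZE, and the backing buffer is assumed to be padded out to a page boundary, as the page cache guarantees for the real code.

#include <string.h>
#include <errno.h>

#define PAGE_SZ 4096UL

/* Stand-in for __copy_insn(); always succeeds for this sketch. */
static int read_chunk(void *dst, const char *src, int nbytes,
		      unsigned long offs)
{
	memcpy(dst, src + offs, nbytes);
	return 0;
}

static int copy_across_pages(const char *backing, unsigned long file_size,
			     void *insn, int size, unsigned long offs)
{
	int err = -EIO;

	/* Copy only available bytes; -EIO if nothing was read at all. */
	do {
		int len;

		if (offs >= file_size)
			break;

		len = size;
		if ((unsigned long)len > PAGE_SZ - (offs & (PAGE_SZ - 1)))
			len = (int)(PAGE_SZ - (offs & (PAGE_SZ - 1)));

		err = read_chunk(insn, backing, len, offs);
		if (err)
			break;

		insn = (char *)insn + len;
		offs += len;
		size -= len;
	} while (size);

	return err;
}
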
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
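
xol_add_vma() publishes the fully initialized area with smp_wmb(), and get_xol_area() pairs that with smp_read_barrier_depends() when it picks the pointer up. The same init-then-publish pattern is sketched below in portable C11 atomics (release store, acquire load); the duplicate-creation race that the kernel resolves under mmap_sem is deliberately ignored, and struct area is an illustrative stand-in for xol_area.

#include <stdatomic.h>
#include <stdlib.h>

struct area {				/* illustrative xol_area stand-in */
	unsigned long vaddr;
	/* ... bitmap, slot page, wait queue ... */
};

static _Atomic(struct area *) xol_area;

static struct area *create_area(unsigned long vaddr)
{
	struct area *a = malloc(sizeof(*a));

	if (!a)
		return NULL;
	a->vaddr = vaddr;				/* initialize first */
	atomic_store_explicit(&xol_area, a,
			      memory_order_release);	/* then publish     */
	return a;
}

static struct area *get_area(void)
{
	struct area *a = atomic_load_explicit(&xol_area,
					      memory_order_acquire);
	if (!a)
		a = create_area(0);
	return a;
}
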
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
 1356 * Allocate a uprobe_task object for the task if necessary. 1356 * Allocate a uprobe_task object for the task if necessary.
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
 1371 * Current area->vaddr notion assumes the trampoline address is always 1455 * Current area->vaddr notion assumes the trampoline address is always
 1372 * equal to area->vaddr. 1456 * equal to area->vaddr.
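
dup_utask() above copies the return_instance chain with the classic pointer-to-pointer tail so the clone's list keeps the parent's order in a single pass. A self-contained sketch of that list-duplication idiom, using an illustrative node type rather than the real return_instance:

#include <errno.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/*
 * Duplicate 'src' into *dst preserving order. On allocation failure the
 * partial copy stays reachable through *dst so the caller can free it,
 * much as dup_utask() leaves t->utask populated for later cleanup.
 */
static int dup_list(struct node **dst, const struct node *src)
{
	struct node **p = dst;
	const struct node *o;

	*dst = NULL;
	for (o = src; o; o = o->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			return -ENOMEM;
		*n = *o;
		n->next = NULL;
		*p = n;		/* append at the tail without re-walking */
		p = &n->next;
	}
	return 0;
}
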
@@ -1682,12 +1766,10 @@ static bool handle_trampoline(struct pt_regs *regs)
1682 tmp = ri; 1766 tmp = ri;
1683 ri = ri->next; 1767 ri = ri->next;
1684 kfree(tmp); 1768 kfree(tmp);
1769 utask->depth--;
1685 1770
1686 if (!chained) 1771 if (!chained)
1687 break; 1772 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri); 1773 BUG_ON(!ri);
1692 } 1774 }
1693 1775
@@ -1859,9 +1941,4 @@ static int __init init_uprobes(void)
1859 1941
1860 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1861} 1943}
1862module_init(init_uprobes); 1944__initcall(init_uprobes);
1863
1864static void __exit exit_uprobes(void)
1865{
1866}
1867module_exit(exit_uprobes);
diff --git a/kernel/extable.c b/kernel/extable.c
index 67460b93b1a1..832cb28105bb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) { 44 if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
45 pr_notice("Sorting __ex_table...\n"); 45 pr_notice("Sorting __ex_table...\n");
46 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
47 } 47 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb8a968..f6d11fc67f72 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 struct rb_node **rb_link, *rb_parent; 351 struct rb_node **rb_link, *rb_parent;
352 int retval; 352 int retval;
353 unsigned long charge; 353 unsigned long charge;
354 struct mempolicy *pol;
355 354
356 uprobe_start_dup_mmap(); 355 uprobe_start_dup_mmap();
357 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
400 goto fail_nomem; 399 goto fail_nomem;
401 *tmp = *mpnt; 400 *tmp = *mpnt;
402 INIT_LIST_HEAD(&tmp->anon_vma_chain); 401 INIT_LIST_HEAD(&tmp->anon_vma_chain);
403 pol = mpol_dup(vma_policy(mpnt)); 402 retval = vma_dup_policy(mpnt, tmp);
404 retval = PTR_ERR(pol); 403 if (retval)
405 if (IS_ERR(pol))
406 goto fail_nomem_policy; 404 goto fail_nomem_policy;
407 vma_set_policy(tmp, pol);
408 tmp->vm_mm = mm; 405 tmp->vm_mm = mm;
409 if (anon_vma_fork(tmp, mpnt)) 406 if (anon_vma_fork(tmp, mpnt))
410 goto fail_nomem_anon_vma_fork; 407 goto fail_nomem_anon_vma_fork;
@@ -472,7 +469,7 @@ out:
472 uprobe_end_dup_mmap(); 469 uprobe_end_dup_mmap();
473 return retval; 470 return retval;
474fail_nomem_anon_vma_fork: 471fail_nomem_anon_vma_fork:
475 mpol_put(pol); 472 mpol_put(vma_policy(tmp));
476fail_nomem_policy: 473fail_nomem_policy:
477 kmem_cache_free(vm_area_cachep, tmp); 474 kmem_cache_free(vm_area_cachep, tmp);
478fail_nomem: 475fail_nomem:
@@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm)
522{ 519{
523#ifdef CONFIG_AIO 520#ifdef CONFIG_AIO
524 spin_lock_init(&mm->ioctx_lock); 521 spin_lock_init(&mm->ioctx_lock);
525 INIT_HLIST_HEAD(&mm->ioctx_list); 522 mm->ioctx_table = NULL;
526#endif 523#endif
527} 524}
528 525
@@ -820,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
820#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
821 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
822#endif 819#endif
823#ifdef CONFIG_NUMA_BALANCING
824 mm->first_nid = NUMA_PTE_SCAN_INIT;
825#endif
826 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
827 goto fail_nomem; 821 goto fail_nomem;
828 822
@@ -1173,12 +1167,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1173 return ERR_PTR(-EINVAL); 1167 return ERR_PTR(-EINVAL);
1174 1168
1175 /* 1169 /*
1176 * If the new process will be in a different pid namespace 1170 * If the new process will be in a different pid or user namespace
1177 * don't allow the creation of threads. 1171 * do not allow it to share a thread group or signal handlers or
1172 * parent with the forking task.
1178 */ 1173 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1174 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
1180 (task_active_pid_ns(current) != current->nsproxy->pid_ns)) 1175 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1181 return ERR_PTR(-EINVAL); 1176 (task_active_pid_ns(current) !=
1177 current->nsproxy->pid_ns_for_children))
1178 return ERR_PTR(-EINVAL);
1179 }
1182 1180
1183 retval = security_task_create(clone_flags); 1181 retval = security_task_create(clone_flags);
1184 if (retval) 1182 if (retval)
@@ -1312,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1312#endif 1310#endif
1313 1311
1314 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1315 sched_fork(p); 1313 sched_fork(clone_flags, p);
1316 1314
1317 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1318 if (retval) 1316 if (retval)
@@ -1351,7 +1349,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1351 1349
1352 if (pid != &init_struct_pid) { 1350 if (pid != &init_struct_pid) {
1353 retval = -ENOMEM; 1351 retval = -ENOMEM;
1354 pid = alloc_pid(p->nsproxy->pid_ns); 1352 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1355 if (!pid) 1353 if (!pid)
1356 goto bad_fork_cleanup_io; 1354 goto bad_fork_cleanup_io;
1357 } 1355 }
@@ -1372,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1372 INIT_LIST_HEAD(&p->pi_state_list); 1370 INIT_LIST_HEAD(&p->pi_state_list);
1373 p->pi_state_cache = NULL; 1371 p->pi_state_cache = NULL;
1374#endif 1372#endif
1375 uprobe_copy_process(p);
1376 /* 1373 /*
1377 * sigaltstack should be cleared when sharing the same VM 1374 * sigaltstack should be cleared when sharing the same VM
1378 */ 1375 */
@@ -1489,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1489 perf_event_fork(p); 1486 perf_event_fork(p);
1490 1487
1491 trace_task_newtask(p, clone_flags); 1488 trace_task_newtask(p, clone_flags);
1489 uprobe_copy_process(p, clone_flags);
1492 1490
1493 return p; 1491 return p;
1494 1492
@@ -1575,15 +1573,6 @@ long do_fork(unsigned long clone_flags,
1575 long nr; 1573 long nr;
1576 1574
1577 /* 1575 /*
1578 * Do some preliminary argument and permissions checking before we
1579 * actually start allocating stuff
1580 */
1581 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1582 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1583 return -EINVAL;
1584 }
1585
1586 /*
1587 * Determine whether and which event to report to ptracer. When 1576 * Determine whether and which event to report to ptracer. When
1588 * called from kernel_thread or CLONE_UNTRACED is explicitly 1577 * called from kernel_thread or CLONE_UNTRACED is explicitly
1589 * requested, no event is reported; otherwise, report if the event 1578 * requested, no event is reported; otherwise, report if the event
@@ -1679,6 +1668,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1679 int __user *, parent_tidptr, 1668 int __user *, parent_tidptr,
1680 int __user *, child_tidptr, 1669 int __user *, child_tidptr,
1681 int, tls_val) 1670 int, tls_val)
1671#elif defined(CONFIG_CLONE_BACKWARDS3)
1672SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
1673 int, stack_size,
1674 int __user *, parent_tidptr,
1675 int __user *, child_tidptr,
1676 int, tls_val)
1682#else 1677#else
1683SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, 1678SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1684 int __user *, parent_tidptr, 1679 int __user *, parent_tidptr,
@@ -1818,11 +1813,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1818 if (unshare_flags & CLONE_NEWUSER) 1813 if (unshare_flags & CLONE_NEWUSER)
1819 unshare_flags |= CLONE_THREAD | CLONE_FS; 1814 unshare_flags |= CLONE_THREAD | CLONE_FS;
1820 /* 1815 /*
1821 * If unsharing a pid namespace must also unshare the thread.
1822 */
1823 if (unshare_flags & CLONE_NEWPID)
1824 unshare_flags |= CLONE_THREAD;
1825 /*
1826 * If unsharing a thread from a thread group, must also unshare vm. 1816 * If unsharing a thread from a thread group, must also unshare vm.
1827 */ 1817 */
1828 if (unshare_flags & CLONE_THREAD) 1818 if (unshare_flags & CLONE_THREAD)
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 9bd0934f6c33..7a7d2ee96d42 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
74{ 74{
75 unsigned long val; 75 unsigned long val;
76 76
77 if (strict_strtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warning("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04ff..90cf1c38c8ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!nsown_capable(CAP_SETGID)) 236 if (!ns_capable(current_user_ns(), CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9d..3e97fb126e6b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h>
18 19
19/* 20/*
20 * The number of tasks checked: 21 * The number of tasks checked:
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
99 * Ok, the task did not get scheduled for more than 2 minutes, 100 * Ok, the task did not get scheduled for more than 2 minutes,
100 * complain: 101 * complain:
101 */ 102 */
102 printk(KERN_ERR "INFO: task %s:%d blocked for more than " 103 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
103 "%ld seconds.\n", t->comm, t->pid, timeout); 104 t->comm, t->pid, timeout);
104 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 105 pr_err(" %s %s %.*s\n",
105 " disables this message.\n"); 106 print_tainted(), init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version);
109 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
110 " disables this message.\n");
106 sched_show_task(t); 111 sched_show_task(t);
107 debug_show_held_locks(t); 112 debug_show_held_locks(t);
108 113
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972a..4a1fef09f658 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,15 +1,4 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem" 1menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code 2# Options selectable by the architecture code
14 3
15# Make sparse irq Kconfig switch below available 4# Make sparse irq Kconfig switch below available
@@ -84,4 +73,3 @@ config SPARSE_IRQ
84 If you don't know what to do here, say N. 73 If you don't know what to do here, say N.
85 74
86endmenu 75endmenu
87endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e9835d..cf68bb36fe58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
465} 465}
466EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
467 467
468unsigned int irq_create_of_mapping(struct device_node *controller, 468unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
469 const u32 *intspec, unsigned int intsize)
470{ 469{
471 struct irq_domain *domain; 470 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 471 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 472 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 473 unsigned int virq;
475 474
476 domain = controller ? irq_find_host(controller) : irq_default_domain; 475 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 476 if (!domain) {
478 pr_warn("no irq domain found for %s !\n", 477 pr_warn("no irq domain found for %s !\n",
479 of_node_full_name(controller)); 478 of_node_full_name(irq_data->np));
480 return 0; 479 return 0;
481 } 480 }
482 481
483 /* If domain has no translation, then we assume interrupt line */ 482 /* If domain has no translation, then we assume interrupt line */
484 if (domain->ops->xlate == NULL) 483 if (domain->ops->xlate == NULL)
485 hwirq = intspec[0]; 484 hwirq = irq_data->args[0];
486 else { 485 else {
487 if (domain->ops->xlate(domain, controller, intspec, intsize, 486 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
488 &hwirq, &type)) 487 irq_data->args_count, &hwirq, &type))
489 return 0; 488 return 0;
490 } 489 }
491 490
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..3e59f951d42f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0d..297a9247a3b3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -13,6 +13,7 @@
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/static_key.h>
16#include <linux/jump_label_ratelimit.h>
16 17
17#ifdef HAVE_JUMP_LABEL 18#ifdef HAVE_JUMP_LABEL
18 19
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba745..2a74f307c5ec 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline,
1474 if (first_colon && (!first_space || first_colon < first_space)) 1474 if (first_colon && (!first_space || first_colon < first_space))
1475 return parse_crashkernel_mem(ck_cmdline, system_ram, 1475 return parse_crashkernel_mem(ck_cmdline, system_ram,
1476 crash_size, crash_base); 1476 crash_size, crash_base);
1477 else
1478 return parse_crashkernel_simple(ck_cmdline, crash_size,
1479 crash_base);
1480 1477
1481 return 0; 1478 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1482} 1479}
1483 1480
1484/* 1481/*
diff --git a/kernel/kmod.c b/kernel/kmod.c
index fb326365b694..b086006c59e7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 DECLARE_COMPLETION_ONSTACK(done); 571 DECLARE_COMPLETION_ONSTACK(done);
572 int retval = 0; 572 int retval = 0;
573 573
574 if (!sub_info->path) {
575 call_usermodehelper_freeinfo(sub_info);
576 return -EINVAL;
577 }
574 helper_lock(); 578 helper_lock();
575 if (!khelper_wq || usermodehelper_disabled) { 579 if (!khelper_wq || usermodehelper_disabled) {
576 retval = -EBUSY; 580 retval = -EBUSY;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6e33498d665c..a0d367a49122 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
112struct kprobe_insn_page { 112struct kprobe_insn_page {
113 struct list_head list; 113 struct list_head list;
114 kprobe_opcode_t *insns; /* Page of instruction slots */ 114 kprobe_opcode_t *insns; /* Page of instruction slots */
115 struct kprobe_insn_cache *cache;
115 int nused; 116 int nused;
116 int ngarbage; 117 int ngarbage;
117 char slot_used[]; 118 char slot_used[];
@@ -121,12 +122,6 @@ struct kprobe_insn_page {
121 (offsetof(struct kprobe_insn_page, slot_used) + \ 122 (offsetof(struct kprobe_insn_page, slot_used) + \
122 (sizeof(char) * (slots))) 123 (sizeof(char) * (slots)))
123 124
124struct kprobe_insn_cache {
125 struct list_head pages; /* list of kprobe_insn_page */
126 size_t insn_size; /* size of instruction slot */
127 int nr_garbage;
128};
129
130static int slots_per_page(struct kprobe_insn_cache *c) 125static int slots_per_page(struct kprobe_insn_cache *c)
131{ 126{
132 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); 127 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,8 +133,20 @@ enum kprobe_slot_state {
138 SLOT_USED = 2, 133 SLOT_USED = 2,
139}; 134};
140 135
141static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ 136static void *alloc_insn_page(void)
142static struct kprobe_insn_cache kprobe_insn_slots = { 137{
138 return module_alloc(PAGE_SIZE);
139}
140
141static void free_insn_page(void *page)
142{
143 module_free(NULL, page);
144}
145
146struct kprobe_insn_cache kprobe_insn_slots = {
147 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
148 .alloc = alloc_insn_page,
149 .free = free_insn_page,
143 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 150 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
144 .insn_size = MAX_INSN_SIZE, 151 .insn_size = MAX_INSN_SIZE,
145 .nr_garbage = 0, 152 .nr_garbage = 0,
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
150 * __get_insn_slot() - Find a slot on an executable page for an instruction. 157 * __get_insn_slot() - Find a slot on an executable page for an instruction.
151 * We allocate an executable page if there's no room on existing ones. 158 * We allocate an executable page if there's no room on existing ones.
152 */ 159 */
153static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
154{ 161{
155 struct kprobe_insn_page *kip; 162 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL;
156 164
165 mutex_lock(&c->mutex);
157 retry: 166 retry:
158 list_for_each_entry(kip, &c->pages, list) { 167 list_for_each_entry(kip, &c->pages, list) {
159 if (kip->nused < slots_per_page(c)) { 168 if (kip->nused < slots_per_page(c)) {
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
162 if (kip->slot_used[i] == SLOT_CLEAN) { 171 if (kip->slot_used[i] == SLOT_CLEAN) {
163 kip->slot_used[i] = SLOT_USED; 172 kip->slot_used[i] = SLOT_USED;
164 kip->nused++; 173 kip->nused++;
165 return kip->insns + (i * c->insn_size); 174 slot = kip->insns + (i * c->insn_size);
175 goto out;
166 } 176 }
167 } 177 }
168 /* kip->nused is broken. Fix it. */ 178 /* kip->nused is broken. Fix it. */
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
178 /* All out of space. Need to allocate a new page. */ 188 /* All out of space. Need to allocate a new page. */
179 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); 189 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
180 if (!kip) 190 if (!kip)
181 return NULL; 191 goto out;
182 192
183 /* 193 /*
184 * Use module_alloc so this page is within +/- 2GB of where the 194 * Use module_alloc so this page is within +/- 2GB of where the
185 * kernel image and loaded module images reside. This is required 195 * kernel image and loaded module images reside. This is required
186 * so x86_64 can correctly handle the %rip-relative fixups. 196 * so x86_64 can correctly handle the %rip-relative fixups.
187 */ 197 */
188 kip->insns = module_alloc(PAGE_SIZE); 198 kip->insns = c->alloc();
189 if (!kip->insns) { 199 if (!kip->insns) {
190 kfree(kip); 200 kfree(kip);
191 return NULL; 201 goto out;
192 } 202 }
193 INIT_LIST_HEAD(&kip->list); 203 INIT_LIST_HEAD(&kip->list);
194 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); 204 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
195 kip->slot_used[0] = SLOT_USED; 205 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 206 kip->nused = 1;
197 kip->ngarbage = 0; 207 kip->ngarbage = 0;
208 kip->cache = c;
198 list_add(&kip->list, &c->pages); 209 list_add(&kip->list, &c->pages);
199 return kip->insns; 210 slot = kip->insns;
200} 211out:
201 212 mutex_unlock(&c->mutex);
202 213 return slot;
203kprobe_opcode_t __kprobes *get_insn_slot(void)
204{
205 kprobe_opcode_t *ret = NULL;
206
207 mutex_lock(&kprobe_insn_mutex);
208 ret = __get_insn_slot(&kprobe_insn_slots);
209 mutex_unlock(&kprobe_insn_mutex);
210
211 return ret;
212} 214}
213 215
214/* Return 1 if all garbages are collected, otherwise 0. */ 216/* Return 1 if all garbages are collected, otherwise 0. */
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
225 */ 227 */
226 if (!list_is_singular(&kip->list)) { 228 if (!list_is_singular(&kip->list)) {
227 list_del(&kip->list); 229 list_del(&kip->list);
228 module_free(NULL, kip->insns); 230 kip->cache->free(kip->insns);
229 kfree(kip); 231 kfree(kip);
230 } 232 }
231 return 1; 233 return 1;
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
255 return 0; 257 return 0;
256} 258}
257 259
258static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
259 kprobe_opcode_t *slot, int dirty) 261 kprobe_opcode_t *slot, int dirty)
260{ 262{
261 struct kprobe_insn_page *kip; 263 struct kprobe_insn_page *kip;
262 264
265 mutex_lock(&c->mutex);
263 list_for_each_entry(kip, &c->pages, list) { 266 list_for_each_entry(kip, &c->pages, list) {
264 long idx = ((long)slot - (long)kip->insns) / 267 long idx = ((long)slot - (long)kip->insns) /
265 (c->insn_size * sizeof(kprobe_opcode_t)); 268 (c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
272 collect_garbage_slots(c); 275 collect_garbage_slots(c);
273 } else 276 } else
274 collect_one_slot(kip, idx); 277 collect_one_slot(kip, idx);
275 return; 278 goto out;
276 } 279 }
277 } 280 }
278 /* Could not free this slot. */ 281 /* Could not free this slot. */
279 WARN_ON(1); 282 WARN_ON(1);
283out:
284 mutex_unlock(&c->mutex);
280} 285}
281 286
282void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
283{
284 mutex_lock(&kprobe_insn_mutex);
285 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
286 mutex_unlock(&kprobe_insn_mutex);
287}
288#ifdef CONFIG_OPTPROBES 287#ifdef CONFIG_OPTPROBES
289/* For optimized_kprobe buffer */ 288/* For optimized_kprobe buffer */
290static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ 289struct kprobe_insn_cache kprobe_optinsn_slots = {
291static struct kprobe_insn_cache kprobe_optinsn_slots = { 290 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
291 .alloc = alloc_insn_page,
292 .free = free_insn_page,
292 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 293 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
293 /* .insn_size is initialized later */ 294 /* .insn_size is initialized later */
294 .nr_garbage = 0, 295 .nr_garbage = 0,
295}; 296};
296/* Get a slot for optimized_kprobe buffer */
297kprobe_opcode_t __kprobes *get_optinsn_slot(void)
298{
299 kprobe_opcode_t *ret = NULL;
300
301 mutex_lock(&kprobe_optinsn_mutex);
302 ret = __get_insn_slot(&kprobe_optinsn_slots);
303 mutex_unlock(&kprobe_optinsn_mutex);
304
305 return ret;
306}
307
308void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
309{
310 mutex_lock(&kprobe_optinsn_mutex);
311 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
312 mutex_unlock(&kprobe_optinsn_mutex);
313}
314#endif 297#endif
315#endif 298#endif
316 299
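
The kprobes change folds the two per-cache mutexes and their get/free wrappers into the cache itself: each kprobe_insn_cache now carries its own mutex plus alloc()/free() callbacks, so one slot implementation serves both the regular and the optimized-probe buffers. A userspace sketch of that shape, with pthread mutexes and malloc standing in for the kernel primitives and purely illustrative names:

#include <pthread.h>
#include <stdlib.h>

struct insn_cache {
	pthread_mutex_t mutex;		/* per-cache, replaces the globals */
	void *(*alloc)(void);		/* backing page allocator          */
	void (*free_page)(void *);	/* matching release hook           */
	size_t insn_size;
};

static void *default_alloc(void) { return malloc(4096); }
static void default_free(void *page) { free(page); }

static struct insn_cache insn_slots = {
	.mutex     = PTHREAD_MUTEX_INITIALIZER,
	.alloc     = default_alloc,
	.free_page = default_free,
	.insn_size = 16,
};

/* __get_insn_slot() analogue: lock the cache it was handed, not a global. */
static void *get_slot(struct insn_cache *c)
{
	void *slot;

	pthread_mutex_lock(&c->mutex);
	slot = c->alloc();	/* the real code reuses partially used pages */
	pthread_mutex_unlock(&c->mutex);
	return slot;
}
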
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9a..9659d38e008f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
113 unsigned long cnt; 113 unsigned long cnt;
114 int ret; 114 int ret;
115 115
116 if (strict_strtoul(buf, 0, &cnt)) 116 if (kstrtoul(buf, 0, &cnt))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 ret = crash_shrink_memory(cnt); 119 ret = crash_shrink_memory(cnt);
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
21 arch_spinlock_t *lock; 21 arch_spinlock_t *lock;
22 22
23 preempt_disable(); 23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock); 25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock); 26 arch_spin_lock(lock);
27} 27}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
31{ 31{
32 arch_spinlock_t *lock; 32 arch_spinlock_t *lock;
33 33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock); 35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock); 36 arch_spin_unlock(lock);
37 preempt_enable(); 37 preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
43 arch_spinlock_t *lock; 43 arch_spinlock_t *lock;
44 44
45 preempt_disable(); 45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu); 47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock); 48 arch_spin_lock(lock);
49} 49}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{ 53{
54 arch_spinlock_t *lock; 54 arch_spinlock_t *lock;
55 55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu); 57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock); 58 arch_spin_unlock(lock);
59 preempt_enable(); 59 preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
65 int i; 65 int i;
66 66
67 preempt_disable(); 67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); 68 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
69 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock; 70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i); 71 lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
78{ 78{
79 int i; 79 int i;
80 80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 81 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) { 82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock; 83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i); 84 lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e16c45b9ee77..4e8e14c34e42 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 2b6e69909c39..7cbd4507a7e6 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -18,14 +18,14 @@
18 18
19struct key *modsign_keyring; 19struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initconst const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initconst const u8 modsign_certificate_list_end[];
23 23
24/* 24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes. 26 * if modsign.pub changes.
27 */ 27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo"; 28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29 29
30/* 30/*
31 * Load the compiled-in keys 31 * Load the compiled-in keys
diff --git a/kernel/module.c b/kernel/module.c
index 206915830d29..dc582749fa13 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val,
136} 136}
137 137
138static const struct kernel_param_ops param_ops_bool_enable_only = { 138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .flags = KERNEL_PARAM_FL_NOARG,
139 .set = param_set_bool_enable_only, 140 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 141 .get = param_get_bool,
141}; 142};
@@ -603,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
603static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 604static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
604 struct module_kobject *mk, char *buffer) \ 605 struct module_kobject *mk, char *buffer) \
605{ \ 606{ \
606 return sprintf(buffer, "%s\n", mk->mod->field); \ 607 return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
607} \ 608} \
608static int modinfo_##field##_exists(struct module *mod) \ 609static int modinfo_##field##_exists(struct module *mod) \
609{ \ 610{ \
@@ -1611,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod)
1611 kfree(mod->modinfo_attrs); 1612 kfree(mod->modinfo_attrs);
1612} 1613}
1613 1614
1615static void mod_kobject_put(struct module *mod)
1616{
1617 DECLARE_COMPLETION_ONSTACK(c);
1618 mod->mkobj.kobj_completion = &c;
1619 kobject_put(&mod->mkobj.kobj);
1620 wait_for_completion(&c);
1621}
1622
1614static int mod_sysfs_init(struct module *mod) 1623static int mod_sysfs_init(struct module *mod)
1615{ 1624{
1616 int err; 1625 int err;
@@ -1638,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod)
1638 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, 1647 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1639 "%s", mod->name); 1648 "%s", mod->name);
1640 if (err) 1649 if (err)
1641 kobject_put(&mod->mkobj.kobj); 1650 mod_kobject_put(mod);
1642 1651
1643 /* delay uevent until full sysfs population */ 1652 /* delay uevent until full sysfs population */
1644out: 1653out:
@@ -1682,7 +1691,7 @@ out_unreg_param:
1682out_unreg_holders: 1691out_unreg_holders:
1683 kobject_put(mod->holders_dir); 1692 kobject_put(mod->holders_dir);
1684out_unreg: 1693out_unreg:
1685 kobject_put(&mod->mkobj.kobj); 1694 mod_kobject_put(mod);
1686out: 1695out:
1687 return err; 1696 return err;
1688} 1697}
@@ -1691,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod)
1691{ 1700{
1692 remove_notes_attrs(mod); 1701 remove_notes_attrs(mod);
1693 remove_sect_attrs(mod); 1702 remove_sect_attrs(mod);
1694 kobject_put(&mod->mkobj.kobj); 1703 mod_kobject_put(mod);
1695} 1704}
1696 1705
1697#else /* !CONFIG_SYSFS */ 1706#else /* !CONFIG_SYSFS */
@@ -2540,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2540/* Sets info->hdr and info->len. */ 2549/* Sets info->hdr and info->len. */
2541static int copy_module_from_fd(int fd, struct load_info *info) 2550static int copy_module_from_fd(int fd, struct load_info *info)
2542{ 2551{
2543 struct file *file; 2552 struct fd f = fdget(fd);
2544 int err; 2553 int err;
2545 struct kstat stat; 2554 struct kstat stat;
2546 loff_t pos; 2555 loff_t pos;
2547 ssize_t bytes = 0; 2556 ssize_t bytes = 0;
2548 2557
2549 file = fget(fd); 2558 if (!f.file)
2550 if (!file)
2551 return -ENOEXEC; 2559 return -ENOEXEC;
2552 2560
2553 err = security_kernel_module_from_file(file); 2561 err = security_kernel_module_from_file(f.file);
2554 if (err) 2562 if (err)
2555 goto out; 2563 goto out;
2556 2564
2557 err = vfs_getattr(&file->f_path, &stat); 2565 err = vfs_getattr(&f.file->f_path, &stat);
2558 if (err) 2566 if (err)
2559 goto out; 2567 goto out;
2560 2568
@@ -2577,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2577 2585
2578 pos = 0; 2586 pos = 0;
2579 while (pos < stat.size) { 2587 while (pos < stat.size) {
2580 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, 2588 bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
2581 stat.size - pos); 2589 stat.size - pos);
2582 if (bytes < 0) { 2590 if (bytes < 0) {
2583 vfree(info->hdr); 2591 vfree(info->hdr);
@@ -2591,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2591 info->len = pos; 2599 info->len = pos;
2592 2600
2593out: 2601out:
2594 fput(file); 2602 fdput(f);
2595 return err; 2603 return err;
2596} 2604}
2597 2605
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff05f4bd86eb..d24105b1b794 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
209 */ 209 */
210static inline int mutex_can_spin_on_owner(struct mutex *lock) 210static inline int mutex_can_spin_on_owner(struct mutex *lock)
211{ 211{
212 struct task_struct *owner;
212 int retval = 1; 213 int retval = 1;
213 214
214 rcu_read_lock(); 215 rcu_read_lock();
215 if (lock->owner) 216 owner = ACCESS_ONCE(lock->owner);
216 retval = lock->owner->on_cpu; 217 if (owner)
218 retval = owner->on_cpu;
217 rcu_read_unlock(); 219 rcu_read_unlock();
218 /* 220 /*
219 * if lock->owner is not set, the mutex owner may have just acquired 221 * if lock->owner is not set, the mutex owner may have just acquired
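
mutex_can_spin_on_owner() now snapshots lock->owner once with ACCESS_ONCE() and only dereferences that snapshot, so the NULL check and the ->on_cpu read cannot observe two different pointers. Below is a sketch of the same single-load idiom; the RCU protection that keeps the task struct alive in the kernel is elided, and a C11 relaxed load stands in for ACCESS_ONCE().

#include <stdatomic.h>
#include <stdbool.h>

struct task { _Atomic int on_cpu; };

static _Atomic(struct task *) lock_owner;

static bool can_spin_on_owner(void)
{
	/* One load: either we use this snapshot or we see NULL; we never
	 * dereference a pointer re-read after the NULL check. */
	struct task *owner = atomic_load_explicit(&lock_owner,
						  memory_order_relaxed);

	if (owner)
		return atomic_load(&owner->on_cpu) != 0;
	return true;	/* no owner: it may have just released, keep spinning */
}
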
@@ -408,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
408static __always_inline int __sched 410static __always_inline int __sched
409__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
410 struct lockdep_map *nest_lock, unsigned long ip, 412 struct lockdep_map *nest_lock, unsigned long ip,
411 struct ww_acquire_ctx *ww_ctx) 413 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
412{ 414{
413 struct task_struct *task = current; 415 struct task_struct *task = current;
414 struct mutex_waiter waiter; 416 struct mutex_waiter waiter;
@@ -448,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
448 struct task_struct *owner; 450 struct task_struct *owner;
449 struct mspin_node node; 451 struct mspin_node node;
450 452
451 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 453 if (use_ww_ctx && ww_ctx->acquired > 0) {
452 struct ww_mutex *ww; 454 struct ww_mutex *ww;
453 455
454 ww = container_of(lock, struct ww_mutex, base); 456 ww = container_of(lock, struct ww_mutex, base);
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
461 * performed the optimistic spinning cannot be done. 463 * performed the optimistic spinning cannot be done.
462 */ 464 */
463 if (ACCESS_ONCE(ww->ctx)) 465 if (ACCESS_ONCE(ww->ctx))
464 break; 466 goto slowpath;
465 } 467 }
466 468
467 /* 469 /*
@@ -472,13 +474,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
472 owner = ACCESS_ONCE(lock->owner); 474 owner = ACCESS_ONCE(lock->owner);
473 if (owner && !mutex_spin_on_owner(lock, owner)) { 475 if (owner && !mutex_spin_on_owner(lock, owner)) {
474 mspin_unlock(MLOCK(lock), &node); 476 mspin_unlock(MLOCK(lock), &node);
475 break; 477 goto slowpath;
476 } 478 }
477 479
478 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
479 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
480 lock_acquired(&lock->dep_map, ip); 482 lock_acquired(&lock->dep_map, ip);
481 if (!__builtin_constant_p(ww_ctx == NULL)) { 483 if (use_ww_ctx) {
482 struct ww_mutex *ww; 484 struct ww_mutex *ww;
483 ww = container_of(lock, struct ww_mutex, base); 485 ww = container_of(lock, struct ww_mutex, base);
484 486
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
499 * the owner complete. 501 * the owner complete.
500 */ 502 */
501 if (!owner && (need_resched() || rt_task(task))) 503 if (!owner && (need_resched() || rt_task(task)))
502 break; 504 goto slowpath;
503 505
504 /* 506 /*
505 * The cpu_relax() call is a compiler barrier which forces 507 * The cpu_relax() call is a compiler barrier which forces
@@ -513,6 +515,10 @@ slowpath:
513#endif 515#endif
514 spin_lock_mutex(&lock->wait_lock, flags); 516 spin_lock_mutex(&lock->wait_lock, flags);
515 517
518 /* once more, can we acquire the lock? */
519 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
520 goto skip_wait;
521
516 debug_mutex_lock_common(lock, &waiter); 522 debug_mutex_lock_common(lock, &waiter);
517 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 523 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
518 524
@@ -520,9 +526,6 @@ slowpath:
520 list_add_tail(&waiter.list, &lock->wait_list); 526 list_add_tail(&waiter.list, &lock->wait_list);
521 waiter.task = task; 527 waiter.task = task;
522 528
523 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
524 goto done;
525
526 lock_contended(&lock->dep_map, ip); 529 lock_contended(&lock->dep_map, ip);
527 530
528 for (;;) { 531 for (;;) {
@@ -536,7 +539,7 @@ slowpath:
536 * other waiters: 539 * other waiters:
537 */ 540 */
538 if (MUTEX_SHOW_NO_WAITER(lock) && 541 if (MUTEX_SHOW_NO_WAITER(lock) &&
539 (atomic_xchg(&lock->count, -1) == 1)) 542 (atomic_xchg(&lock->count, -1) == 1))
540 break; 543 break;
541 544
542 /* 545 /*
@@ -548,7 +551,7 @@ slowpath:
548 goto err; 551 goto err;
549 } 552 }
550 553
551 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 554 if (use_ww_ctx && ww_ctx->acquired > 0) {
552 ret = __mutex_lock_check_stamp(lock, ww_ctx); 555 ret = __mutex_lock_check_stamp(lock, ww_ctx);
553 if (ret) 556 if (ret)
554 goto err; 557 goto err;
@@ -561,24 +564,25 @@ slowpath:
561 schedule_preempt_disabled(); 564 schedule_preempt_disabled();
562 spin_lock_mutex(&lock->wait_lock, flags); 565 spin_lock_mutex(&lock->wait_lock, flags);
563 } 566 }
567 mutex_remove_waiter(lock, &waiter, current_thread_info());
568 /* set it to 0 if there are no waiters left: */
569 if (likely(list_empty(&lock->wait_list)))
570 atomic_set(&lock->count, 0);
571 debug_mutex_free_waiter(&waiter);
564 572
565done: 573skip_wait:
574 /* got the lock - cleanup and rejoice! */
566 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
567 /* got the lock - rejoice! */
568 mutex_remove_waiter(lock, &waiter, current_thread_info());
569 mutex_set_owner(lock); 576 mutex_set_owner(lock);
570 577
571 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (use_ww_ctx) {
572 struct ww_mutex *ww = container_of(lock, 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
576 581
577 /* 582 /*
578 * This branch gets optimized out for the common case, 583 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock. 584 * and is only important for ww_mutex_lock.
580 */ 585 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx); 586 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx; 587 ww->ctx = ww_ctx;
584 588
@@ -592,15 +596,8 @@ done:
592 } 596 }
593 } 597 }
594 598
595 /* set it to 0 if there are no waiters left: */
596 if (likely(list_empty(&lock->wait_list)))
597 atomic_set(&lock->count, 0);
598
599 spin_unlock_mutex(&lock->wait_lock, flags); 599 spin_unlock_mutex(&lock->wait_lock, flags);
600
601 debug_mutex_free_waiter(&waiter);
602 preempt_enable(); 600 preempt_enable();
603
604 return 0; 601 return 0;
605 602
606err: 603err:
@@ -618,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
618{ 615{
619 might_sleep(); 616 might_sleep();
620 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
621 subclass, NULL, _RET_IP_, NULL); 618 subclass, NULL, _RET_IP_, NULL, 0);
622} 619}
623 620
624EXPORT_SYMBOL_GPL(mutex_lock_nested); 621EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -628,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
628{ 625{
629 might_sleep(); 626 might_sleep();
630 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
631 0, nest, _RET_IP_, NULL); 628 0, nest, _RET_IP_, NULL, 0);
632} 629}
633 630
634EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -638,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
638{ 635{
639 might_sleep(); 636 might_sleep();
640 return __mutex_lock_common(lock, TASK_KILLABLE, 637 return __mutex_lock_common(lock, TASK_KILLABLE,
641 subclass, NULL, _RET_IP_, NULL); 638 subclass, NULL, _RET_IP_, NULL, 0);
642} 639}
643EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
644 641
@@ -647,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
647{ 644{
648 might_sleep(); 645 might_sleep();
649 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
650 subclass, NULL, _RET_IP_, NULL); 647 subclass, NULL, _RET_IP_, NULL, 0);
651} 648}
652 649
653EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -685,8 +682,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
685 682
686 might_sleep(); 683 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx, 1);
689 if (!ret && ctx->acquired > 0) 686 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
691 688
692 return ret; 689 return ret;
@@ -700,9 +697,9 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
700 697
701 might_sleep(); 698 might_sleep();
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx, 1);
704 701
705 if (!ret && ctx->acquired > 0) 702 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
707 704
708 return ret; 705 return ret;
@@ -812,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
812 struct mutex *lock = container_of(lock_count, struct mutex, count); 809 struct mutex *lock = container_of(lock_count, struct mutex, count);
813 810
814 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, 811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
815 NULL, _RET_IP_, NULL); 812 NULL, _RET_IP_, NULL, 0);
816} 813}
817 814
818static noinline int __sched 815static noinline int __sched
819__mutex_lock_killable_slowpath(struct mutex *lock) 816__mutex_lock_killable_slowpath(struct mutex *lock)
820{ 817{
821 return __mutex_lock_common(lock, TASK_KILLABLE, 0, 818 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
822 NULL, _RET_IP_, NULL); 819 NULL, _RET_IP_, NULL, 0);
823} 820}
824 821
825static noinline int __sched 822static noinline int __sched
826__mutex_lock_interruptible_slowpath(struct mutex *lock) 823__mutex_lock_interruptible_slowpath(struct mutex *lock)
827{ 824{
828 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, 825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
829 NULL, _RET_IP_, NULL); 826 NULL, _RET_IP_, NULL, 0);
830} 827}
831 828
832static noinline int __sched 829static noinline int __sched
833__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
834{ 831{
835 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, 832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
836 NULL, _RET_IP_, ctx); 833 NULL, _RET_IP_, ctx, 1);
837} 834}
838 835
839static noinline int __sched 836static noinline int __sched
@@ -841,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
841 struct ww_acquire_ctx *ctx) 838 struct ww_acquire_ctx *ctx)
842{ 839{
843 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, 840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
844 NULL, _RET_IP_, ctx); 841 NULL, _RET_IP_, ctx, 1);
845} 842}
846 843
847#endif 844#endif
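Passing ww_ctx together with an explicit const bool use_ww_ctx replaces the earlier __builtin_constant_p(ww_ctx == NULL) test: every caller hands in a literal 0 or 1, and because __mutex_lock_common() is __always_inline the compiler folds the flag and discards the wait/wound branches in the plain-mutex paths. A generic sketch of that compile-time-flag pattern, with hypothetical names:

#include <linux/compiler.h>	/* __always_inline */

static __always_inline int do_lock(void *ww_ctx, const bool use_ww_ctx)
{
	if (use_ww_ctx && ww_ctx)	/* dead code when the caller passes 0 */
		return 1;		/* wait/wound bookkeeping would go here */
	return 0;
}

static int plain_lock(void)   { return do_lock(NULL, 0); }	/* ww branches compiled out */
static int ww_lock(void *ctx) { return do_lock(ctx, 1);  }	/* ww branches kept */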
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0c..8e7811086b82 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@
29static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
30 30
31struct nsproxy init_nsproxy = { 31struct nsproxy init_nsproxy = {
32 .count = ATOMIC_INIT(1), 32 .count = ATOMIC_INIT(1),
33 .uts_ns = &init_uts_ns, 33 .uts_ns = &init_uts_ns,
34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) 34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
35 .ipc_ns = &init_ipc_ns, 35 .ipc_ns = &init_ipc_ns,
36#endif 36#endif
37 .mnt_ns = NULL, 37 .mnt_ns = NULL,
38 .pid_ns = &init_pid_ns, 38 .pid_ns_for_children = &init_pid_ns,
39#ifdef CONFIG_NET 39#ifdef CONFIG_NET
40 .net_ns = &init_net, 40 .net_ns = &init_net,
41#endif 41#endif
42}; 42};
43 43
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
85 goto out_ipc; 85 goto out_ipc;
86 } 86 }
87 87
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 88 new_nsp->pid_ns_for_children =
89 if (IS_ERR(new_nsp->pid_ns)) { 89 copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
90 err = PTR_ERR(new_nsp->pid_ns); 90 if (IS_ERR(new_nsp->pid_ns_for_children)) {
91 err = PTR_ERR(new_nsp->pid_ns_for_children);
91 goto out_pid; 92 goto out_pid;
92 } 93 }
93 94
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
100 return new_nsp; 101 return new_nsp;
101 102
102out_net: 103out_net:
103 if (new_nsp->pid_ns) 104 if (new_nsp->pid_ns_for_children)
104 put_pid_ns(new_nsp->pid_ns); 105 put_pid_ns(new_nsp->pid_ns_for_children);
105out_pid: 106out_pid:
106 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
107 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -125,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
125 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
127 struct nsproxy *new_ns; 128 struct nsproxy *new_ns;
128 int err = 0;
129 129
130 if (!old_ns) 130 if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
131 CLONE_NEWPID | CLONE_NEWNET)))) {
132 get_nsproxy(old_ns);
131 return 0; 133 return 0;
132
133 get_nsproxy(old_ns);
134
135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
136 CLONE_NEWPID | CLONE_NEWNET)))
137 return 0;
138
139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
140 err = -EPERM;
141 goto out;
142 } 134 }
143 135
136 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
137 return -EPERM;
138
144 /* 139 /*
145 * CLONE_NEWIPC must detach from the undolist: after switching 140 * CLONE_NEWIPC must detach from the undolist: after switching
146 * to a new ipc namespace, the semaphore arrays from the old 141 * to a new ipc namespace, the semaphore arrays from the old
@@ -148,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
148 * means share undolist with parent, so we must forbid using 143 * means share undolist with parent, so we must forbid using
149 * it along with CLONE_NEWIPC. 144 * it along with CLONE_NEWIPC.
150 */ 145 */
151 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { 146 if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
152 err = -EINVAL; 147 (CLONE_NEWIPC | CLONE_SYSVSEM))
153 goto out; 148 return -EINVAL;
154 }
155 149
156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); 150 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
157 if (IS_ERR(new_ns)) { 151 if (IS_ERR(new_ns))
158 err = PTR_ERR(new_ns); 152 return PTR_ERR(new_ns);
159 goto out;
160 }
161 153
162 tsk->nsproxy = new_ns; 154 tsk->nsproxy = new_ns;
163 155 return 0;
164out:
165 put_nsproxy(old_ns);
166 return err;
167} 156}
168 157
169void free_nsproxy(struct nsproxy *ns) 158void free_nsproxy(struct nsproxy *ns)
@@ -174,8 +163,8 @@ void free_nsproxy(struct nsproxy *ns)
174 put_uts_ns(ns->uts_ns); 163 put_uts_ns(ns->uts_ns);
175 if (ns->ipc_ns) 164 if (ns->ipc_ns)
176 put_ipc_ns(ns->ipc_ns); 165 put_ipc_ns(ns->ipc_ns);
177 if (ns->pid_ns) 166 if (ns->pid_ns_for_children)
178 put_pid_ns(ns->pid_ns); 167 put_pid_ns(ns->pid_ns_for_children);
179 put_net(ns->net_ns); 168 put_net(ns->net_ns);
180 kmem_cache_free(nsproxy_cachep, ns); 169 kmem_cache_free(nsproxy_cachep, ns);
181} 170}
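Besides the pid_ns_for_children rename, the copy_namespaces() rewrite above folds the CLONE_NEWIPC/CLONE_SYSVSEM conflict test into a single mask comparison and lets the error paths return directly instead of unwinding through a shared out: label. The mask idiom, sketched with generic flag names:

#define WANT_A	(1UL << 0)
#define WANT_B	(1UL << 1)

/* true only when both bits are set; one compare instead of two nested tests */
static int flags_conflict(unsigned long flags)
{
	return (flags & (WANT_A | WANT_B)) == (WANT_A | WANT_B);
}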
diff --git a/kernel/padata.c b/kernel/padata.c
index 072f4ee4eb89..07af2c95dcfe 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
846 switch (action) { 846 switch (action) {
847 case CPU_ONLINE: 847 case CPU_ONLINE:
848 case CPU_ONLINE_FROZEN: 848 case CPU_ONLINE_FROZEN:
849 case CPU_DOWN_FAILED:
850 case CPU_DOWN_FAILED_FROZEN:
849 if (!pinst_has_cpu(pinst, cpu)) 851 if (!pinst_has_cpu(pinst, cpu))
850 break; 852 break;
851 mutex_lock(&pinst->lock); 853 mutex_lock(&pinst->lock);
@@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
857 859
858 case CPU_DOWN_PREPARE: 860 case CPU_DOWN_PREPARE:
859 case CPU_DOWN_PREPARE_FROZEN: 861 case CPU_DOWN_PREPARE_FROZEN:
862 case CPU_UP_CANCELED:
863 case CPU_UP_CANCELED_FROZEN:
860 if (!pinst_has_cpu(pinst, cpu)) 864 if (!pinst_has_cpu(pinst, cpu))
861 break; 865 break;
862 mutex_lock(&pinst->lock); 866 mutex_lock(&pinst->lock);
@@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb,
865 if (err) 869 if (err)
866 return notifier_from_errno(err); 870 return notifier_from_errno(err);
867 break; 871 break;
868
869 case CPU_UP_CANCELED:
870 case CPU_UP_CANCELED_FROZEN:
871 if (!pinst_has_cpu(pinst, cpu))
872 break;
873 mutex_lock(&pinst->lock);
874 __padata_remove_cpu(pinst, cpu);
875 mutex_unlock(&pinst->lock);
876
877 case CPU_DOWN_FAILED:
878 case CPU_DOWN_FAILED_FROZEN:
879 if (!pinst_has_cpu(pinst, cpu))
880 break;
881 mutex_lock(&pinst->lock);
882 __padata_add_cpu(pinst, cpu);
883 mutex_unlock(&pinst->lock);
884 } 872 }
885 873
886 return NOTIFY_OK; 874 return NOTIFY_OK;
@@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1086 1074
1087 pinst->flags = 0; 1075 pinst->flags = 0;
1088 1076
1089#ifdef CONFIG_HOTPLUG_CPU
1090 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1091 pinst->cpu_notifier.priority = 0;
1092 register_hotcpu_notifier(&pinst->cpu_notifier);
1093#endif
1094
1095 put_online_cpus(); 1077 put_online_cpus();
1096 1078
1097 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); 1079 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1098 kobject_init(&pinst->kobj, &padata_attr_type); 1080 kobject_init(&pinst->kobj, &padata_attr_type);
1099 mutex_init(&pinst->lock); 1081 mutex_init(&pinst->lock);
1100 1082
1083#ifdef CONFIG_HOTPLUG_CPU
1084 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1085 pinst->cpu_notifier.priority = 0;
1086 register_hotcpu_notifier(&pinst->cpu_notifier);
1087#endif
1088
1101 return pinst; 1089 return pinst;
1102 1090
1103err_free_masks: 1091err_free_masks:
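The padata notifier rework above merges CPU_DOWN_FAILED into the "add the CPU" cases and CPU_UP_CANCELED into the "remove the CPU" cases, dropping two duplicated blocks (one of which fell through without a break), and it now registers the hotplug notifier only after the instance is fully initialised. A minimal sketch of grouping hotplug actions that want identical handling (hypothetical callback, not the padata code):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb, unsigned long action,
			   void *hcpu)
{
	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_FAILED:		/* a failed offline re-adds the CPU */
	case CPU_DOWN_FAILED_FROZEN:
		/* ...add the CPU to the instance... */
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_UP_CANCELED:		/* a cancelled online removes it again */
	case CPU_UP_CANCELED_FROZEN:
		/* ...remove the CPU from the instance... */
		break;
	}
	return NOTIFY_OK;
}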
diff --git a/kernel/panic.c b/kernel/panic.c
index 801864600514..b6c482ccc5db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...)
123 */ 123 */
124 smp_send_stop(); 124 smp_send_stop();
125 125
126 kmsg_dump(KMSG_DUMP_PANIC); 126 /*
127 127 * Run any panic handlers, including those that might need to
128 * add information to the kmsg dump output.
129 */
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 130 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 131
132 kmsg_dump(KMSG_DUMP_PANIC);
133
130 bust_spinlocks(0); 134 bust_spinlocks(0);
131 135
132 if (!panic_blink) 136 if (!panic_blink)
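Running the panic notifier chain before kmsg_dump() means anything a handler prints now ends up in the dumped log, as the added comment says. A minimal sketch of such a handler, with hypothetical names; panic notifiers hang off the exported panic_notifier_list and receive the panic message as the void * argument:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int my_panic_note(struct notifier_block *nb, unsigned long event,
			 void *msg)
{
	pr_emerg("extra panic context, panic reason: %s\n", (const char *)msg);
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
	.notifier_call = my_panic_note,
};

/* registration, e.g. from an __init function:
 *	atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);
 */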
diff --git a/kernel/params.c b/kernel/params.c
index 440e65d1a544..c00d5b502aa4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -103,8 +103,8 @@ static int parse_one(char *param,
103 || params[i].level > max_level) 103 || params[i].level > max_level)
104 return 0; 104 return 0;
105 /* No one handled NULL, so do it here. */ 105 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool 106 if (!val &&
107 && params[i].ops->set != param_set_bint) 107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
108 return -EINVAL; 108 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 109 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 110 params[i].ops->set);
@@ -241,7 +241,8 @@ int parse_args(const char *doing,
241 } \ 241 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 242 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 243 { \
244 return sprintf(buffer, format, *((type *)kp->arg)); \ 244 return scnprintf(buffer, PAGE_SIZE, format, \
245 *((type *)kp->arg)); \
245 } \ 246 } \
246 struct kernel_param_ops param_ops_##name = { \ 247 struct kernel_param_ops param_ops_##name = { \
247 .set = param_set_##name, \ 248 .set = param_set_##name, \
@@ -252,13 +253,13 @@ int parse_args(const char *doing,
252 EXPORT_SYMBOL(param_ops_##name) 253 EXPORT_SYMBOL(param_ops_##name)
253 254
254 255
255STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
256STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
257STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); 258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
258STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); 259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
259STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); 260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
260STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
261STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
262 263
263int param_set_charp(const char *val, const struct kernel_param *kp) 264int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 265{
@@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp);
285 286
286int param_get_charp(char *buffer, const struct kernel_param *kp) 287int param_get_charp(char *buffer, const struct kernel_param *kp)
287{ 288{
288 return sprintf(buffer, "%s", *((char **)kp->arg)); 289 return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
289} 290}
290EXPORT_SYMBOL(param_get_charp); 291EXPORT_SYMBOL(param_get_charp);
291 292
@@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
320EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
321 322
322struct kernel_param_ops param_ops_bool = { 323struct kernel_param_ops param_ops_bool = {
324 .flags = KERNEL_PARAM_FL_NOARG,
323 .set = param_set_bool, 325 .set = param_set_bool,
324 .get = param_get_bool, 326 .get = param_get_bool,
325}; 327};
@@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
370EXPORT_SYMBOL(param_set_bint); 372EXPORT_SYMBOL(param_set_bint);
371 373
372struct kernel_param_ops param_ops_bint = { 374struct kernel_param_ops param_ops_bint = {
375 .flags = KERNEL_PARAM_FL_NOARG,
373 .set = param_set_bint, 376 .set = param_set_bint,
374 .get = param_get_int, 377 .get = param_get_int,
375}; 378};
@@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
827 struct module_version_attribute *vattr = 830 struct module_version_attribute *vattr =
828 container_of(mattr, struct module_version_attribute, mattr); 831 container_of(mattr, struct module_version_attribute, mattr);
829 832
830 return sprintf(buf, "%s\n", vattr->version); 833 return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
831} 834}
832 835
833extern const struct module_version_attribute *__start___modver[]; 836extern const struct module_version_attribute *__start___modver[];
@@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = {
912struct kset *module_kset; 915struct kset *module_kset;
913int module_sysfs_initialized; 916int module_sysfs_initialized;
914 917
918static void module_kobj_release(struct kobject *kobj)
919{
920 struct module_kobject *mk = to_module_kobject(kobj);
921 complete(mk->kobj_completion);
922}
923
915struct kobj_type module_ktype = { 924struct kobj_type module_ktype = {
925 .release = module_kobj_release,
916 .sysfs_ops = &module_sysfs_ops, 926 .sysfs_ops = &module_sysfs_ops,
917}; 927};
918 928
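Two themes run through the params.c hunks: the output helpers move from sprintf() to scnprintf() so a sysfs read can never overrun the PAGE_SIZE buffer, and the "parameter may take no argument" special case is now driven by the KERNEL_PARAM_FL_NOARG ops flag instead of comparing set callbacks. A standalone sketch of the bounded-print helper (hypothetical show function, not the kernel code):

#include <linux/kernel.h>	/* scnprintf() */
#include <linux/mm.h>		/* PAGE_SIZE */

static int show_answer(char *buffer)	/* buffer is a PAGE_SIZE sysfs buffer */
{
	/* returns the number of characters written, excluding the NUL,
	 * and never writes past PAGE_SIZE */
	return scnprintf(buffer, PAGE_SIZE, "%d\n", 42);
}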
diff --git a/kernel/pid.c b/kernel/pid.c
index 66505c1dfc51..9b9a26698144 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,6 +265,7 @@ void free_pid(struct pid *pid)
265 struct pid_namespace *ns = upid->ns; 265 struct pid_namespace *ns = upid->ns;
266 hlist_del_rcu(&upid->pid_chain); 266 hlist_del_rcu(&upid->pid_chain);
267 switch(--ns->nr_hashed) { 267 switch(--ns->nr_hashed) {
268 case 2:
268 case 1: 269 case 1:
269 /* When all that is left in the pid namespace 270 /* When all that is left in the pid namespace
270 * is the reaper wake up the reaper. The reaper 271 * is the reaper wake up the reaper. The reaper
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
272 */ 273 */
273 wake_up_process(ns->child_reaper); 274 wake_up_process(ns->child_reaper);
274 break; 275 break;
276 case PIDNS_HASH_ADDING:
277 /* Handle a fork failure of the first process */
278 WARN_ON(ns->child_reaper);
279 ns->nr_hashed = 0;
280 /* fall through */
275 case 0: 281 case 0:
276 schedule_work(&ns->proc_work); 282 schedule_work(&ns->proc_work);
277 break; 283 break;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48e..42086551a24a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
329 struct pid_namespace *ancestor, *new = ns; 329 struct pid_namespace *ancestor, *new = ns;
330 330
331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
332 !nsown_capable(CAP_SYS_ADMIN)) 332 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
333 return -EPERM; 333 return -EPERM;
334 334
335 /* 335 /*
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
349 if (ancestor != active) 349 if (ancestor != active)
350 return -EINVAL; 350 return -EINVAL;
351 351
352 put_pid_ns(nsproxy->pid_ns); 352 put_pid_ns(nsproxy->pid_ns_for_children);
353 nsproxy->pid_ns = get_pid_ns(new); 353 nsproxy->pid_ns_for_children = get_pid_ns(new);
354 return 0; 354 return 0;
355} 355}
356 356
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773e..0121dab83f43 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -39,7 +39,7 @@ static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 42__visible int in_suspend __nosavedata;
43 43
44enum { 44enum {
45 HIBERNATION_INVALID, 45 HIBERNATION_INVALID,
@@ -644,22 +644,23 @@ int hibernate(void)
644 if (error) 644 if (error)
645 goto Exit; 645 goto Exit;
646 646
647 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps();
649 if (error)
650 goto Exit;
651
652 printk(KERN_INFO "PM: Syncing filesystems ... "); 647 printk(KERN_INFO "PM: Syncing filesystems ... ");
653 sys_sync(); 648 sys_sync();
654 printk("done.\n"); 649 printk("done.\n");
655 650
656 error = freeze_processes(); 651 error = freeze_processes();
657 if (error) 652 if (error)
658 goto Free_bitmaps; 653 goto Exit;
654
655 lock_device_hotplug();
656 /* Allocate memory management structures */
657 error = create_basic_memory_bitmaps();
658 if (error)
659 goto Thaw;
659 660
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 662 if (error || freezer_test_done)
662 goto Thaw; 663 goto Free_bitmaps;
663 664
664 if (in_suspend) { 665 if (in_suspend) {
665 unsigned int flags = 0; 666 unsigned int flags = 0;
@@ -682,14 +683,14 @@ int hibernate(void)
682 pr_debug("PM: Image restored successfully.\n"); 683 pr_debug("PM: Image restored successfully.\n");
683 } 684 }
684 685
686 Free_bitmaps:
687 free_basic_memory_bitmaps();
685 Thaw: 688 Thaw:
689 unlock_device_hotplug();
686 thaw_processes(); 690 thaw_processes();
687 691
688 /* Don't bother checking whether freezer_test_done is true */ 692 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false; 693 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps();
693 Exit: 694 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 695 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 696 pm_restore_console();
@@ -806,21 +807,20 @@ static int software_resume(void)
806 pm_prepare_console(); 807 pm_prepare_console();
807 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 808 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
808 if (error) 809 if (error)
809 goto close_finish; 810 goto Close_Finish;
810
811 error = create_basic_memory_bitmaps();
812 if (error)
813 goto close_finish;
814 811
815 pr_debug("PM: Preparing processes for restore.\n"); 812 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 813 error = freeze_processes();
817 if (error) { 814 if (error)
818 swsusp_close(FMODE_READ); 815 goto Close_Finish;
819 goto Done;
820 }
821 816
822 pr_debug("PM: Loading hibernation image.\n"); 817 pr_debug("PM: Loading hibernation image.\n");
823 818
819 lock_device_hotplug();
820 error = create_basic_memory_bitmaps();
821 if (error)
822 goto Thaw;
823
824 error = swsusp_read(&flags); 824 error = swsusp_read(&flags);
825 swsusp_close(FMODE_READ); 825 swsusp_close(FMODE_READ);
826 if (!error) 826 if (!error)
@@ -828,9 +828,10 @@ static int software_resume(void)
828 828
829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); 829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
830 swsusp_free(); 830 swsusp_free();
831 thaw_processes();
832 Done:
833 free_basic_memory_bitmaps(); 831 free_basic_memory_bitmaps();
832 Thaw:
833 unlock_device_hotplug();
834 thaw_processes();
834 Finish: 835 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 836 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 837 pm_restore_console();
@@ -840,12 +841,12 @@ static int software_resume(void)
840 mutex_unlock(&pm_mutex); 841 mutex_unlock(&pm_mutex);
841 pr_debug("PM: Hibernation image not present or could not be loaded.\n"); 842 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
842 return error; 843 return error;
843close_finish: 844 Close_Finish:
844 swsusp_close(FMODE_READ); 845 swsusp_close(FMODE_READ);
845 goto Finish; 846 goto Finish;
846} 847}
847 848
848late_initcall(software_resume); 849late_initcall_sync(software_resume);
849 850
850 851
851static const char * const hibernation_modes[] = { 852static const char * const hibernation_modes[] = {
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 06fe28589e9c..a394297f8b2f 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req)
296} 296}
297EXPORT_SYMBOL_GPL(pm_qos_request_active); 297EXPORT_SYMBOL_GPL(pm_qos_request_active);
298 298
299static void __pm_qos_update_request(struct pm_qos_request *req,
300 s32 new_value)
301{
302 trace_pm_qos_update_request(req->pm_qos_class, new_value);
303
304 if (new_value != req->node.prio)
305 pm_qos_update_target(
306 pm_qos_array[req->pm_qos_class]->constraints,
307 &req->node, PM_QOS_UPDATE_REQ, new_value);
308}
309
299/** 310/**
300 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout 311 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
301 * @work: work struct for the delayed work (timeout) 312 * @work: work struct for the delayed work (timeout)
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work)
308 struct pm_qos_request, 319 struct pm_qos_request,
309 work); 320 work);
310 321
311 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); 322 __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
312} 323}
313 324
314/** 325/**
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
364 } 375 }
365 376
366 cancel_delayed_work_sync(&req->work); 377 cancel_delayed_work_sync(&req->work);
367 378 __pm_qos_update_request(req, new_value);
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
369 if (new_value != req->node.prio)
370 pm_qos_update_target(
371 pm_qos_array[req->pm_qos_class]->constraints,
372 &req->node, PM_QOS_UPDATE_REQ, new_value);
373} 379}
374EXPORT_SYMBOL_GPL(pm_qos_update_request); 380EXPORT_SYMBOL_GPL(pm_qos_update_request);
375 381
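Factoring out __pm_qos_update_request() lets the timeout handler apply the default value directly instead of going through pm_qos_update_request(), which starts with cancel_delayed_work_sync() on the very work item that is currently running. The general shape of that refactor, sketched with hypothetical names:

#include <linux/workqueue.h>

struct thing {
	int value;
	struct delayed_work work;
};

#define THING_DEFAULT_VALUE	(-1)

/* inner helper: applies the change, never touches the work item */
static void __thing_update(struct thing *t, int v)
{
	if (v != t->value)
		t->value = v;
}

/* public API: cancel any pending timeout, then update */
static void thing_update(struct thing *t, int v)
{
	cancel_delayed_work_sync(&t->work);	/* must not run from thing_work_fn() */
	__thing_update(t, v);
}

/* timeout handler: calls the inner helper directly, avoiding self-cancel */
static void thing_work_fn(struct work_struct *w)
{
	struct thing *t = container_of(to_delayed_work(w), struct thing, work);

	__thing_update(t, THING_DEFAULT_VALUE);
}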
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 349587bb03e1..98c3b34a4cff 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
352 struct mem_extent *ext, *cur, *aux; 352 struct mem_extent *ext, *cur, *aux;
353 353
354 zone_start = zone->zone_start_pfn; 354 zone_start = zone->zone_start_pfn;
355 zone_end = zone->zone_start_pfn + zone->spanned_pages; 355 zone_end = zone_end_pfn(zone);
356 356
357 list_for_each_entry(ext, list, hook) 357 list_for_each_entry(ext, list, hook)
358 if (zone_start <= ext->end) 358 if (zone_start <= ext->end)
@@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void)
743 struct memory_bitmap *bm1, *bm2; 743 struct memory_bitmap *bm1, *bm2;
744 int error = 0; 744 int error = 0;
745 745
746 BUG_ON(forbidden_pages_map || free_pages_map); 746 if (forbidden_pages_map && free_pages_map)
747 return 0;
748 else
749 BUG_ON(forbidden_pages_map || free_pages_map);
747 750
748 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); 751 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
749 if (!bm1) 752 if (!bm1)
@@ -884,7 +887,7 @@ static unsigned int count_highmem_pages(void)
884 continue; 887 continue;
885 888
886 mark_free_pages(zone); 889 mark_free_pages(zone);
887 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 890 max_zone_pfn = zone_end_pfn(zone);
888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 891 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
889 if (saveable_highmem_page(zone, pfn)) 892 if (saveable_highmem_page(zone, pfn))
890 n++; 893 n++;
@@ -948,7 +951,7 @@ static unsigned int count_data_pages(void)
948 continue; 951 continue;
949 952
950 mark_free_pages(zone); 953 mark_free_pages(zone);
951 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 954 max_zone_pfn = zone_end_pfn(zone);
952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 955 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
953 if (saveable_page(zone, pfn)) 956 if (saveable_page(zone, pfn))
954 n++; 957 n++;
@@ -1041,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1041 unsigned long max_zone_pfn; 1044 unsigned long max_zone_pfn;
1042 1045
1043 mark_free_pages(zone); 1046 mark_free_pages(zone);
1044 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1047 max_zone_pfn = zone_end_pfn(zone);
1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1048 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1046 if (page_is_saveable(zone, pfn)) 1049 if (page_is_saveable(zone, pfn))
1047 memory_bm_set_bit(orig_bm, pfn); 1050 memory_bm_set_bit(orig_bm, pfn);
@@ -1093,7 +1096,7 @@ void swsusp_free(void)
1093 unsigned long pfn, max_zone_pfn; 1096 unsigned long pfn, max_zone_pfn;
1094 1097
1095 for_each_populated_zone(zone) { 1098 for_each_populated_zone(zone) {
1096 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1099 max_zone_pfn = zone_end_pfn(zone);
1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1100 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1098 if (pfn_valid(pfn)) { 1101 if (pfn_valid(pfn)) {
1099 struct page *page = pfn_to_page(pfn); 1102 struct page *page = pfn_to_page(pfn);
@@ -1755,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1755 1758
1756 /* Clear page flags */ 1759 /* Clear page flags */
1757 for_each_populated_zone(zone) { 1760 for_each_populated_zone(zone) {
1758 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1761 max_zone_pfn = zone_end_pfn(zone);
1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1762 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1760 if (pfn_valid(pfn)) 1763 if (pfn_valid(pfn))
1761 swsusp_unset_page_free(pfn_to_page(pfn)); 1764 swsusp_unset_page_free(pfn_to_page(pfn));
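All of the snapshot.c hunks substitute the zone_end_pfn() helper for the open-coded zone->zone_start_pfn + zone->spanned_pages expression; the helper (from include/linux/mmzone.h) is equivalent, so the change is purely a readability cleanup. Sketch of the equivalence and the usual loop shape:

#include <linux/mmzone.h>

/* equivalent to the open-coded expression being replaced */
static inline unsigned long my_zone_end_pfn(const struct zone *zone)
{
	return zone->zone_start_pfn + zone->spanned_pages;
}

/* typical walk over a zone:
 *	for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone); pfn++)
 *		...
 */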
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1e..62ee437b5c7e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 goto Platform_wake; 210 goto Platform_wake;
211 } 211 }
212 212
213 ftrace_stop();
213 error = disable_nonboot_cpus(); 214 error = disable_nonboot_cpus();
214 if (error || suspend_test(TEST_CPUS)) 215 if (error || suspend_test(TEST_CPUS))
215 goto Enable_cpus; 216 goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
232 233
233 Enable_cpus: 234 Enable_cpus:
234 enable_nonboot_cpus(); 235 enable_nonboot_cpus();
236 ftrace_start();
235 237
236 Platform_wake: 238 Platform_wake:
237 if (need_suspend_ops(state) && suspend_ops->wake) 239 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
265 goto Close; 267 goto Close;
266 } 268 }
267 suspend_console(); 269 suspend_console();
268 ftrace_stop();
269 suspend_test_start(); 270 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 271 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 272 if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
285 suspend_test_start(); 286 suspend_test_start();
286 dpm_resume_end(PMSG_RESUME); 287 dpm_resume_end(PMSG_RESUME);
287 suspend_test_finish("resume devices"); 288 suspend_test_finish("resume devices");
288 ftrace_start();
289 resume_console(); 289 resume_console();
290 Close: 290 Close:
291 if (need_suspend_ops(state) && suspend_ops->end) 291 if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86f..957f06164ad1 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -39,6 +39,7 @@ static struct snapshot_data {
39 char frozen; 39 char frozen;
40 char ready; 40 char ready;
41 char platform_support; 41 char platform_support;
42 bool free_bitmaps;
42} snapshot_state; 43} snapshot_state;
43 44
44atomic_t snapshot_device_available = ATOMIC_INIT(1); 45atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
60 error = -ENOSYS; 61 error = -ENOSYS;
61 goto Unlock; 62 goto Unlock;
62 } 63 }
63 if(create_basic_memory_bitmaps()) {
64 atomic_inc(&snapshot_device_available);
65 error = -ENOMEM;
66 goto Unlock;
67 }
68 nonseekable_open(inode, filp); 64 nonseekable_open(inode, filp);
69 data = &snapshot_state; 65 data = &snapshot_state;
70 filp->private_data = data; 66 filp->private_data = data;
@@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp)
87 data->swap = -1; 83 data->swap = -1;
88 data->mode = O_WRONLY; 84 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 85 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
86 if (!error) {
87 error = create_basic_memory_bitmaps();
88 data->free_bitmaps = !error;
89 }
90 if (error) 90 if (error)
91 pm_notifier_call_chain(PM_POST_RESTORE); 91 pm_notifier_call_chain(PM_POST_RESTORE);
92 } 92 }
93 if (error) { 93 if (error)
94 free_basic_memory_bitmaps();
95 atomic_inc(&snapshot_device_available); 94 atomic_inc(&snapshot_device_available);
96 } 95
97 data->frozen = 0; 96 data->frozen = 0;
98 data->ready = 0; 97 data->ready = 0;
99 data->platform_support = 0; 98 data->platform_support = 0;
@@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp)
111 lock_system_sleep(); 110 lock_system_sleep();
112 111
113 swsusp_free(); 112 swsusp_free();
114 free_basic_memory_bitmaps();
115 data = filp->private_data; 113 data = filp->private_data;
116 free_all_swap_pages(data->swap); 114 free_all_swap_pages(data->swap);
117 if (data->frozen) { 115 if (data->frozen) {
118 pm_restore_gfp_mask(); 116 pm_restore_gfp_mask();
117 free_basic_memory_bitmaps();
119 thaw_processes(); 118 thaw_processes();
119 } else if (data->free_bitmaps) {
120 free_basic_memory_bitmaps();
120 } 121 }
121 pm_notifier_call_chain(data->mode == O_RDONLY ? 122 pm_notifier_call_chain(data->mode == O_RDONLY ?
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 123 PM_POST_HIBERNATION : PM_POST_RESTORE);
@@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
207 if (!mutex_trylock(&pm_mutex)) 208 if (!mutex_trylock(&pm_mutex))
208 return -EBUSY; 209 return -EBUSY;
209 210
211 lock_device_hotplug();
210 data = filp->private_data; 212 data = filp->private_data;
211 213
212 switch (cmd) { 214 switch (cmd) {
@@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
220 printk("done.\n"); 222 printk("done.\n");
221 223
222 error = freeze_processes(); 224 error = freeze_processes();
223 if (!error) 225 if (error)
226 break;
227
228 error = create_basic_memory_bitmaps();
229 if (error)
230 thaw_processes();
231 else
224 data->frozen = 1; 232 data->frozen = 1;
233
225 break; 234 break;
226 235
227 case SNAPSHOT_UNFREEZE: 236 case SNAPSHOT_UNFREEZE:
228 if (!data->frozen || data->ready) 237 if (!data->frozen || data->ready)
229 break; 238 break;
230 pm_restore_gfp_mask(); 239 pm_restore_gfp_mask();
240 free_basic_memory_bitmaps();
241 data->free_bitmaps = false;
231 thaw_processes(); 242 thaw_processes();
232 data->frozen = 0; 243 data->frozen = 0;
233 break; 244 break;
@@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
371 382
372 } 383 }
373 384
385 unlock_device_hotplug();
374 mutex_unlock(&pm_mutex); 386 mutex_unlock(&pm_mutex);
375 387
376 return error; 388 return error;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5a7080e2a5..b4e8500afdb3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon)
2226 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c; 2227 struct console_cmdline *c;
2228 2228
2229 if (console_drivers)
2230 for_each_console(bcon)
2231 if (WARN(bcon == newcon,
2232 "console '%s%d' already registered\n",
2233 bcon->name, bcon->index))
2234 return;
2235
2229 /* 2236 /*
2230 * before we register a new CON_BOOT console, make sure we don't 2237 * before we register a new CON_BOOT console, make sure we don't
2231 * already have a valid console 2238 * already have a valid console
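The register_console() guard above leans on the fact that WARN(cond, ...) both prints (with a backtrace) and evaluates to cond, so the duplicate check and the early return collapse into one statement. The same idiom in isolation, with a hypothetical device check:

#include <linux/bug.h>		/* WARN() */

struct some_dev;

static void poke_device(struct some_dev *dev)
{
	if (WARN(!dev, "no device, skipping poke\n"))
		return;
	/* ...safe to touch the hardware here... */
}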
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee327f6a..dd562e9aa2c8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
236 */ 236 */
237 int dumpable = 0; 237 int dumpable = 0;
238 /* Don't let security modules deny introspection */ 238 /* Don't let security modules deny introspection */
239 if (task == current) 239 if (same_thread_group(task, current))
240 return 0; 240 return 0;
241 rcu_read_lock(); 241 rcu_read_lock();
242 tcred = __task_cred(task); 242 tcred = __task_cred(task);
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 7f8e7590e3e5..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -67,12 +67,15 @@
67 67
68extern struct debug_obj_descr rcuhead_debug_descr; 68extern struct debug_obj_descr rcuhead_debug_descr;
69 69
70static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline int debug_rcu_head_queue(struct rcu_head *head)
71{ 71{
72 debug_object_activate(head, &rcuhead_debug_descr); 72 int r1;
73
74 r1 = debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr, 75 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY, 76 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED); 77 STATE_RCU_HEAD_QUEUED);
78 return r1;
76} 79}
77 80
78static inline void debug_rcu_head_unqueue(struct rcu_head *head) 81static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
83 debug_object_deactivate(head, &rcuhead_debug_descr); 86 debug_object_deactivate(head, &rcuhead_debug_descr);
84} 87}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 88#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head) 89static inline int debug_rcu_head_queue(struct rcu_head *head)
87{ 90{
91 return 0;
88} 92}
89 93
90static inline void debug_rcu_head_unqueue(struct rcu_head *head) 94static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
94 98
95extern void kfree(const void *); 99extern void kfree(const void *);
96 100
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
98{ 102{
99 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
100 104
@@ -118,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
118 122
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
121#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
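TPS() wraps tracepoint_string(), which returns the address of the literal and also records it so user-space tools such as perf and trace-cmd can translate the pointer back into text, as the comment above says. The usage pattern, taken from the kernel/rcu/tiny.c hunk further below:

	/* constant tracepoint strings go through the wrapper: */
	RCU_TRACE(trace_rcu_dyntick(TPS("Start"), rcu_dynticks_nesting, newval));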
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index aa344111de3e..0c9a934cfec1 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -264,7 +266,7 @@ void rcu_check_callbacks(int cpu, int user)
264 */ 266 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 267static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 268{
267 char *rn = NULL; 269 const char *rn = NULL;
268 struct rcu_head *next, *list; 270 struct rcu_head *next, *list;
269 unsigned long flags; 271 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 272 RCU_TRACE(int cb_count = 0);
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
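Note that the tiny.c hunk above inverts the predicate along with the rename: rcu_is_cpu_idle() returned !rcu_dynticks_nesting, while __rcu_is_watching() returns rcu_dynticks_nesting directly, so callers now ask "is RCU watching this CPU?" rather than "is it idle?". The !!ACCESS_ONCE(rcp->rcucblist) change normalises the list pointer to 0/1 before it is handed to a boolean trace field; the idiom in isolation, with a hypothetical helper:

#include <linux/compiler.h>

static int have_entries(void **listp)
{
	return !!ACCESS_ONCE(*listp);	/* any non-NULL pointer becomes 1 */
}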
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ 36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ 37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ 38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(const char *name); /* Name of RCU type. */
40}; 40};
41 41
42/* Definition for rcupdate control block. */ 42/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index f4871e52c546..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,72 +52,84 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55MODULE_ALIAS("rcutorture");
56static int nfakewriters = 4; /* # fake writer threads */ 56#ifdef MODULE_PARAM_PREFIX
57static int stat_interval = 60; /* Interval between stats, in seconds. */ 57#undef MODULE_PARAM_PREFIX
58 /* Zero means "only at end of test". */ 58#endif
59static bool verbose; /* Print more debug info. */ 59#define MODULE_PARAM_PREFIX "rcutorture."
60static bool test_no_idle_hz = true;
61 /* Test RCU support for tickless idle CPUs. */
62static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
63static int stutter = 5; /* Start/stop testing interval (in sec) */
64static int irqreader = 1; /* RCU readers from irq (timers). */
65static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
66static int fqs_holdoff; /* Hold time within burst (us). */
67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
69static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
70static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
71static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
72static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
73static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
74static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
75static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
76static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
77static char *torture_type = "rcu"; /* What RCU implementation to torture. */
78 60
79module_param(nreaders, int, 0444); 61static int fqs_duration;
80MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
81module_param(nfakewriters, int, 0444);
82MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
83module_param(stat_interval, int, 0644);
84MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
85module_param(verbose, bool, 0444);
86MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
87module_param(test_no_idle_hz, bool, 0444);
88MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
89module_param(shuffle_interval, int, 0444);
90MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
91module_param(stutter, int, 0444);
92MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
93module_param(irqreader, int, 0444);
94MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
95module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
96MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
64static int fqs_holdoff;
97module_param(fqs_holdoff, int, 0444); 65module_param(fqs_holdoff, int, 0444);
98MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 66MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
67static int fqs_stutter = 3;
99module_param(fqs_stutter, int, 0444); 68module_param(fqs_stutter, int, 0444);
100MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 69MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
70static bool gp_exp;
71module_param(gp_exp, bool, 0444);
72MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
73static bool gp_normal;
74module_param(gp_normal, bool, 0444);
75MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
76static int irqreader = 1;
77module_param(irqreader, int, 0444);
78MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
79static int n_barrier_cbs;
101module_param(n_barrier_cbs, int, 0444); 80module_param(n_barrier_cbs, int, 0444);
102MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 81MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
103module_param(onoff_interval, int, 0444); 82static int nfakewriters = 4;
104MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 83module_param(nfakewriters, int, 0444);
84MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
85static int nreaders = -1;
86module_param(nreaders, int, 0444);
87MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
88static int object_debug;
89module_param(object_debug, int, 0444);
90MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
91static int onoff_holdoff;
105module_param(onoff_holdoff, int, 0444); 92module_param(onoff_holdoff, int, 0444);
106MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 93MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
94static int onoff_interval;
95module_param(onoff_interval, int, 0444);
96MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
97static int shuffle_interval = 3;
98module_param(shuffle_interval, int, 0444);
99MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
100static int shutdown_secs;
107module_param(shutdown_secs, int, 0444); 101module_param(shutdown_secs, int, 0444);
108MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 102MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
103static int stall_cpu;
109module_param(stall_cpu, int, 0444); 104module_param(stall_cpu, int, 0444);
110MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 105MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
106static int stall_cpu_holdoff = 10;
111module_param(stall_cpu_holdoff, int, 0444); 107module_param(stall_cpu_holdoff, int, 0444);
112MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
109static int stat_interval = 60;
110module_param(stat_interval, int, 0644);
111MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
112static int stutter = 5;
113module_param(stutter, int, 0444);
114MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
115static int test_boost = 1;
113module_param(test_boost, int, 0444); 116module_param(test_boost, int, 0444);
114MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 117MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
115module_param(test_boost_interval, int, 0444); 118static int test_boost_duration = 4;
116MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
117module_param(test_boost_duration, int, 0444); 119module_param(test_boost_duration, int, 0444);
118MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 120MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
121static int test_boost_interval = 7;
122module_param(test_boost_interval, int, 0444);
123MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
124static bool test_no_idle_hz = true;
125module_param(test_no_idle_hz, bool, 0444);
126MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
127static char *torture_type = "rcu";
119module_param(torture_type, charp, 0444); 128module_param(torture_type, charp, 0444);
120MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 129MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
130static bool verbose;
131module_param(verbose, bool, 0444);
132MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
121 133
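
The hunk above reorders rcutorture's module parameters so that each knob's declaration sits immediately next to its module_param() and MODULE_PARM_DESC() lines, in alphabetical order. As a minimal sketch of that three-line pattern (the parameter name below is made up, not one of rcutorture's):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Hypothetical knob, shown only to illustrate the declaration pattern. */
static int example_interval = 3;                /* default value */
module_param(example_interval, int, 0444);      /* 0444: visible in sysfs, read-only */
MODULE_PARM_DESC(example_interval, "Seconds between example events");

Keeping the three lines adjacent is what makes the new layout easier to audit than the old split between the declaration block and the module_param() block.
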
122#define TORTURE_FLAG "-torture:" 134#define TORTURE_FLAG "-torture:"
123#define PRINTK_STRING(s) \ 135#define PRINTK_STRING(s) \
@@ -267,7 +279,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
267 * Absorb kthreads into a kernel function that won't return, so that 279 * Absorb kthreads into a kernel function that won't return, so that
268 * they won't ever access module text or data again. 280 * they won't ever access module text or data again.
269 */ 281 */
270static void rcutorture_shutdown_absorb(char *title) 282static void rcutorture_shutdown_absorb(const char *title)
271{ 283{
272 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 284 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
273 pr_notice( 285 pr_notice(
@@ -337,7 +349,7 @@ rcu_random(struct rcu_random_state *rrsp)
337} 349}
338 350
339static void 351static void
340rcu_stutter_wait(char *title) 352rcu_stutter_wait(const char *title)
341{ 353{
342 while (stutter_pause_test || !rcutorture_runnable) { 354 while (stutter_pause_test || !rcutorture_runnable) {
343 if (rcutorture_runnable) 355 if (rcutorture_runnable)
@@ -360,13 +372,14 @@ struct rcu_torture_ops {
360 int (*completed)(void); 372 int (*completed)(void);
361 void (*deferred_free)(struct rcu_torture *p); 373 void (*deferred_free)(struct rcu_torture *p);
362 void (*sync)(void); 374 void (*sync)(void);
375 void (*exp_sync)(void);
363 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 376 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
364 void (*cb_barrier)(void); 377 void (*cb_barrier)(void);
365 void (*fqs)(void); 378 void (*fqs)(void);
366 int (*stats)(char *page); 379 int (*stats)(char *page);
367 int irq_capable; 380 int irq_capable;
368 int can_boost; 381 int can_boost;
369 char *name; 382 const char *name;
370}; 383};
371 384
372static struct rcu_torture_ops *cur_ops; 385static struct rcu_torture_ops *cur_ops;
@@ -443,81 +456,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 456 call_rcu(&p->rtort_rcu, rcu_torture_cb);
444} 457}
445 458
446static struct rcu_torture_ops rcu_ops = {
447 .init = NULL,
448 .readlock = rcu_torture_read_lock,
449 .read_delay = rcu_read_delay,
450 .readunlock = rcu_torture_read_unlock,
451 .completed = rcu_torture_completed,
452 .deferred_free = rcu_torture_deferred_free,
453 .sync = synchronize_rcu,
454 .call = call_rcu,
455 .cb_barrier = rcu_barrier,
456 .fqs = rcu_force_quiescent_state,
457 .stats = NULL,
458 .irq_capable = 1,
459 .can_boost = rcu_can_boost(),
460 .name = "rcu"
461};
462
463static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
464{
465 int i;
466 struct rcu_torture *rp;
467 struct rcu_torture *rp1;
468
469 cur_ops->sync();
470 list_add(&p->rtort_free, &rcu_torture_removed);
471 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
472 i = rp->rtort_pipe_count;
473 if (i > RCU_TORTURE_PIPE_LEN)
474 i = RCU_TORTURE_PIPE_LEN;
475 atomic_inc(&rcu_torture_wcount[i]);
476 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
477 rp->rtort_mbtest = 0;
478 list_del(&rp->rtort_free);
479 rcu_torture_free(rp);
480 }
481 }
482}
483
484static void rcu_sync_torture_init(void) 459static void rcu_sync_torture_init(void)
485{ 460{
486 INIT_LIST_HEAD(&rcu_torture_removed); 461 INIT_LIST_HEAD(&rcu_torture_removed);
487} 462}
488 463
489static struct rcu_torture_ops rcu_sync_ops = { 464static struct rcu_torture_ops rcu_ops = {
490 .init = rcu_sync_torture_init, 465 .init = rcu_sync_torture_init,
491 .readlock = rcu_torture_read_lock, 466 .readlock = rcu_torture_read_lock,
492 .read_delay = rcu_read_delay, 467 .read_delay = rcu_read_delay,
493 .readunlock = rcu_torture_read_unlock, 468 .readunlock = rcu_torture_read_unlock,
494 .completed = rcu_torture_completed, 469 .completed = rcu_torture_completed,
495 .deferred_free = rcu_sync_torture_deferred_free, 470 .deferred_free = rcu_torture_deferred_free,
496 .sync = synchronize_rcu, 471 .sync = synchronize_rcu,
497 .call = NULL, 472 .exp_sync = synchronize_rcu_expedited,
498 .cb_barrier = NULL, 473 .call = call_rcu,
499 .fqs = rcu_force_quiescent_state, 474 .cb_barrier = rcu_barrier,
500 .stats = NULL,
501 .irq_capable = 1,
502 .can_boost = rcu_can_boost(),
503 .name = "rcu_sync"
504};
505
506static struct rcu_torture_ops rcu_expedited_ops = {
507 .init = rcu_sync_torture_init,
508 .readlock = rcu_torture_read_lock,
509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
510 .readunlock = rcu_torture_read_unlock,
511 .completed = rcu_no_completed,
512 .deferred_free = rcu_sync_torture_deferred_free,
513 .sync = synchronize_rcu_expedited,
514 .call = NULL,
515 .cb_barrier = NULL,
516 .fqs = rcu_force_quiescent_state, 475 .fqs = rcu_force_quiescent_state,
517 .stats = NULL, 476 .stats = NULL,
518 .irq_capable = 1, 477 .irq_capable = 1,
519 .can_boost = rcu_can_boost(), 478 .can_boost = rcu_can_boost(),
520 .name = "rcu_expedited" 479 .name = "rcu"
521}; 480};
522 481
523/* 482/*
@@ -546,13 +505,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
546} 505}
547 506
548static struct rcu_torture_ops rcu_bh_ops = { 507static struct rcu_torture_ops rcu_bh_ops = {
549 .init = NULL, 508 .init = rcu_sync_torture_init,
550 .readlock = rcu_bh_torture_read_lock, 509 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 510 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 511 .readunlock = rcu_bh_torture_read_unlock,
553 .completed = rcu_bh_torture_completed, 512 .completed = rcu_bh_torture_completed,
554 .deferred_free = rcu_bh_torture_deferred_free, 513 .deferred_free = rcu_bh_torture_deferred_free,
555 .sync = synchronize_rcu_bh, 514 .sync = synchronize_rcu_bh,
515 .exp_sync = synchronize_rcu_bh_expedited,
556 .call = call_rcu_bh, 516 .call = call_rcu_bh,
557 .cb_barrier = rcu_barrier_bh, 517 .cb_barrier = rcu_barrier_bh,
558 .fqs = rcu_bh_force_quiescent_state, 518 .fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +521,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
561 .name = "rcu_bh" 521 .name = "rcu_bh"
562}; 522};
563 523
564static struct rcu_torture_ops rcu_bh_sync_ops = {
565 .init = rcu_sync_torture_init,
566 .readlock = rcu_bh_torture_read_lock,
567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
568 .readunlock = rcu_bh_torture_read_unlock,
569 .completed = rcu_bh_torture_completed,
570 .deferred_free = rcu_sync_torture_deferred_free,
571 .sync = synchronize_rcu_bh,
572 .call = NULL,
573 .cb_barrier = NULL,
574 .fqs = rcu_bh_force_quiescent_state,
575 .stats = NULL,
576 .irq_capable = 1,
577 .name = "rcu_bh_sync"
578};
579
580static struct rcu_torture_ops rcu_bh_expedited_ops = {
581 .init = rcu_sync_torture_init,
582 .readlock = rcu_bh_torture_read_lock,
583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
584 .readunlock = rcu_bh_torture_read_unlock,
585 .completed = rcu_bh_torture_completed,
586 .deferred_free = rcu_sync_torture_deferred_free,
587 .sync = synchronize_rcu_bh_expedited,
588 .call = NULL,
589 .cb_barrier = NULL,
590 .fqs = rcu_bh_force_quiescent_state,
591 .stats = NULL,
592 .irq_capable = 1,
593 .name = "rcu_bh_expedited"
594};
595
596/* 524/*
597 * Definitions for srcu torture testing. 525 * Definitions for srcu torture testing.
598 */ 526 */
@@ -667,6 +595,11 @@ static int srcu_torture_stats(char *page)
667 return cnt; 595 return cnt;
668} 596}
669 597
598static void srcu_torture_synchronize_expedited(void)
599{
600 synchronize_srcu_expedited(&srcu_ctl);
601}
602
670static struct rcu_torture_ops srcu_ops = { 603static struct rcu_torture_ops srcu_ops = {
671 .init = rcu_sync_torture_init, 604 .init = rcu_sync_torture_init,
672 .readlock = srcu_torture_read_lock, 605 .readlock = srcu_torture_read_lock,
@@ -675,45 +608,13 @@ static struct rcu_torture_ops srcu_ops = {
675 .completed = srcu_torture_completed, 608 .completed = srcu_torture_completed,
676 .deferred_free = srcu_torture_deferred_free, 609 .deferred_free = srcu_torture_deferred_free,
677 .sync = srcu_torture_synchronize, 610 .sync = srcu_torture_synchronize,
611 .exp_sync = srcu_torture_synchronize_expedited,
678 .call = srcu_torture_call, 612 .call = srcu_torture_call,
679 .cb_barrier = srcu_torture_barrier, 613 .cb_barrier = srcu_torture_barrier,
680 .stats = srcu_torture_stats, 614 .stats = srcu_torture_stats,
681 .name = "srcu" 615 .name = "srcu"
682}; 616};
683 617
684static struct rcu_torture_ops srcu_sync_ops = {
685 .init = rcu_sync_torture_init,
686 .readlock = srcu_torture_read_lock,
687 .read_delay = srcu_read_delay,
688 .readunlock = srcu_torture_read_unlock,
689 .completed = srcu_torture_completed,
690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = srcu_torture_synchronize,
692 .call = NULL,
693 .cb_barrier = NULL,
694 .stats = srcu_torture_stats,
695 .name = "srcu_sync"
696};
697
698static void srcu_torture_synchronize_expedited(void)
699{
700 synchronize_srcu_expedited(&srcu_ctl);
701}
702
703static struct rcu_torture_ops srcu_expedited_ops = {
704 .init = rcu_sync_torture_init,
705 .readlock = srcu_torture_read_lock,
706 .read_delay = srcu_read_delay,
707 .readunlock = srcu_torture_read_unlock,
708 .completed = srcu_torture_completed,
709 .deferred_free = rcu_sync_torture_deferred_free,
710 .sync = srcu_torture_synchronize_expedited,
711 .call = NULL,
712 .cb_barrier = NULL,
713 .stats = srcu_torture_stats,
714 .name = "srcu_expedited"
715};
716
717/* 618/*
718 * Definitions for sched torture testing. 619 * Definitions for sched torture testing.
719 */ 620 */
@@ -742,6 +643,8 @@ static struct rcu_torture_ops sched_ops = {
742 .completed = rcu_no_completed, 643 .completed = rcu_no_completed,
743 .deferred_free = rcu_sched_torture_deferred_free, 644 .deferred_free = rcu_sched_torture_deferred_free,
744 .sync = synchronize_sched, 645 .sync = synchronize_sched,
646 .exp_sync = synchronize_sched_expedited,
647 .call = call_rcu_sched,
745 .cb_barrier = rcu_barrier_sched, 648 .cb_barrier = rcu_barrier_sched,
746 .fqs = rcu_sched_force_quiescent_state, 649 .fqs = rcu_sched_force_quiescent_state,
747 .stats = NULL, 650 .stats = NULL,
@@ -749,35 +652,6 @@ static struct rcu_torture_ops sched_ops = {
749 .name = "sched" 652 .name = "sched"
750}; 653};
751 654
752static struct rcu_torture_ops sched_sync_ops = {
753 .init = rcu_sync_torture_init,
754 .readlock = sched_torture_read_lock,
755 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
756 .readunlock = sched_torture_read_unlock,
757 .completed = rcu_no_completed,
758 .deferred_free = rcu_sync_torture_deferred_free,
759 .sync = synchronize_sched,
760 .cb_barrier = NULL,
761 .fqs = rcu_sched_force_quiescent_state,
762 .stats = NULL,
763 .name = "sched_sync"
764};
765
766static struct rcu_torture_ops sched_expedited_ops = {
767 .init = rcu_sync_torture_init,
768 .readlock = sched_torture_read_lock,
769 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
770 .readunlock = sched_torture_read_unlock,
771 .completed = rcu_no_completed,
772 .deferred_free = rcu_sync_torture_deferred_free,
773 .sync = synchronize_sched_expedited,
774 .cb_barrier = NULL,
775 .fqs = rcu_sched_force_quiescent_state,
776 .stats = NULL,
777 .irq_capable = 1,
778 .name = "sched_expedited"
779};
780
781/* 655/*
782 * RCU torture priority-boost testing. Runs one real-time thread per 656 * RCU torture priority-boost testing. Runs one real-time thread per
783 * CPU for moderate bursts, repeatedly registering RCU callbacks and 657 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +801,10 @@ rcu_torture_fqs(void *arg)
927static int 801static int
928rcu_torture_writer(void *arg) 802rcu_torture_writer(void *arg)
929{ 803{
804 bool exp;
930 int i; 805 int i;
931 long oldbatch = rcu_batches_completed();
932 struct rcu_torture *rp; 806 struct rcu_torture *rp;
807 struct rcu_torture *rp1;
933 struct rcu_torture *old_rp; 808 struct rcu_torture *old_rp;
934 static DEFINE_RCU_RANDOM(rand); 809 static DEFINE_RCU_RANDOM(rand);
935 810
@@ -954,10 +829,33 @@ rcu_torture_writer(void *arg)
954 i = RCU_TORTURE_PIPE_LEN; 829 i = RCU_TORTURE_PIPE_LEN;
955 atomic_inc(&rcu_torture_wcount[i]); 830 atomic_inc(&rcu_torture_wcount[i]);
956 old_rp->rtort_pipe_count++; 831 old_rp->rtort_pipe_count++;
957 cur_ops->deferred_free(old_rp); 832 if (gp_normal == gp_exp)
833 exp = !!(rcu_random(&rand) & 0x80);
834 else
835 exp = gp_exp;
836 if (!exp) {
837 cur_ops->deferred_free(old_rp);
838 } else {
839 cur_ops->exp_sync();
840 list_add(&old_rp->rtort_free,
841 &rcu_torture_removed);
842 list_for_each_entry_safe(rp, rp1,
843 &rcu_torture_removed,
844 rtort_free) {
845 i = rp->rtort_pipe_count;
846 if (i > RCU_TORTURE_PIPE_LEN)
847 i = RCU_TORTURE_PIPE_LEN;
848 atomic_inc(&rcu_torture_wcount[i]);
849 if (++rp->rtort_pipe_count >=
850 RCU_TORTURE_PIPE_LEN) {
851 rp->rtort_mbtest = 0;
852 list_del(&rp->rtort_free);
853 rcu_torture_free(rp);
854 }
855 }
856 }
958 } 857 }
959 rcutorture_record_progress(++rcu_torture_current_version); 858 rcutorture_record_progress(++rcu_torture_current_version);
960 oldbatch = cur_ops->completed();
961 rcu_stutter_wait("rcu_torture_writer"); 859 rcu_stutter_wait("rcu_torture_writer");
962 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 860 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
963 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 861 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
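
In the writer loop above, gp_normal and gp_exp act as a three-way switch: when exactly one of them is set, that grace-period style is forced; when both or neither are set, each iteration picks normal vs. expedited at random. A stripped-down sketch of just that decision (hypothetical helper, not code from the patch):

#include <linux/types.h>

/* Mirror of the expedited-vs-normal choice made per writer iteration. */
static bool choose_expedited(bool want_normal, bool want_exp, unsigned long rnd)
{
        if (want_normal == want_exp)    /* both set or both clear: randomize */
                return !!(rnd & 0x80);
        return want_exp;                /* exactly one set: honor the request */
}

The expedited path has no callback to lean on, so the writer waits synchronously via exp_sync() and then drains rcu_torture_removed itself, which is why the list walk moved from the old rcu_sync_torture_deferred_free() into rcu_torture_writer().
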
@@ -983,10 +881,18 @@ rcu_torture_fakewriter(void *arg)
983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 881 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
984 udelay(rcu_random(&rand) & 0x3ff); 882 udelay(rcu_random(&rand) & 0x3ff);
985 if (cur_ops->cb_barrier != NULL && 883 if (cur_ops->cb_barrier != NULL &&
986 rcu_random(&rand) % (nfakewriters * 8) == 0) 884 rcu_random(&rand) % (nfakewriters * 8) == 0) {
987 cur_ops->cb_barrier(); 885 cur_ops->cb_barrier();
988 else 886 } else if (gp_normal == gp_exp) {
887 if (rcu_random(&rand) & 0x80)
888 cur_ops->sync();
889 else
890 cur_ops->exp_sync();
891 } else if (gp_normal) {
989 cur_ops->sync(); 892 cur_ops->sync();
893 } else {
894 cur_ops->exp_sync();
895 }
990 rcu_stutter_wait("rcu_torture_fakewriter"); 896 rcu_stutter_wait("rcu_torture_fakewriter");
991 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 897 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
992 898
@@ -1364,7 +1270,7 @@ rcu_torture_stutter(void *arg)
1364} 1270}
1365 1271
1366static inline void 1272static inline void
1367rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1273rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1368{ 1274{
1369 pr_alert("%s" TORTURE_FLAG 1275 pr_alert("%s" TORTURE_FLAG
1370 "--- %s: nreaders=%d nfakewriters=%d " 1276 "--- %s: nreaders=%d nfakewriters=%d "
@@ -1534,7 +1440,13 @@ rcu_torture_onoff(void *arg)
1534 torture_type, cpu); 1440 torture_type, cpu);
1535 starttime = jiffies; 1441 starttime = jiffies;
1536 n_online_attempts++; 1442 n_online_attempts++;
1537 if (cpu_up(cpu) == 0) { 1443 ret = cpu_up(cpu);
1444 if (ret) {
1445 if (verbose)
1446 pr_alert("%s" TORTURE_FLAG
1447 "rcu_torture_onoff task: online %d failed: errno %d\n",
1448 torture_type, cpu, ret);
1449 } else {
1538 if (verbose) 1450 if (verbose)
1539 pr_alert("%s" TORTURE_FLAG 1451 pr_alert("%s" TORTURE_FLAG
1540 "rcu_torture_onoff task: onlined %d\n", 1452 "rcu_torture_onoff task: onlined %d\n",
@@ -1934,6 +1846,62 @@ rcu_torture_cleanup(void)
1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1846 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1935} 1847}
1936 1848
1849#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1850static void rcu_torture_leak_cb(struct rcu_head *rhp)
1851{
1852}
1853
1854static void rcu_torture_err_cb(struct rcu_head *rhp)
1855{
1856 /*
1857 * This -might- happen due to race conditions, but is unlikely.
1858 * The scenario that leads to this happening is that the
1859 * first of the pair of duplicate callbacks is queued,
1860 * someone else starts a grace period that includes that
1861 * callback, then the second of the pair must wait for the
1862 * next grace period. Unlikely, but can happen. If it
1863 * does happen, the debug-objects subsystem won't have splatted.
1864 */
1865 pr_alert("rcutorture: duplicated callback was invoked.\n");
1866}
1867#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1868
1869/*
1870 * Verify that double-free causes debug-objects to complain, but only
1871 * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
1872 * cannot be carried out.
1873 */
1874static void rcu_test_debug_objects(void)
1875{
1876#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1877 struct rcu_head rh1;
1878 struct rcu_head rh2;
1879
1880 init_rcu_head_on_stack(&rh1);
1881 init_rcu_head_on_stack(&rh2);
1882 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
1883
1884 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1885 preempt_disable(); /* Prevent preemption from interrupting test. */
1886 rcu_read_lock(); /* Make it impossible to finish a grace period. */
1887 call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
1888 local_irq_disable(); /* Make it harder to start a new grace period. */
1889 call_rcu(&rh2, rcu_torture_leak_cb);
1890 call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
1891 local_irq_enable();
1892 rcu_read_unlock();
1893 preempt_enable();
1894
1895 /* Wait for them all to get done so we can safely return. */
1896 rcu_barrier();
1897 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
1898 destroy_rcu_head_on_stack(&rh1);
1899 destroy_rcu_head_on_stack(&rh2);
1900#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1901 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
1902#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1903}
1904
1937static int __init 1905static int __init
1938rcu_torture_init(void) 1906rcu_torture_init(void)
1939{ 1907{
@@ -1941,11 +1909,9 @@ rcu_torture_init(void)
1941 int cpu; 1909 int cpu;
1942 int firsterr = 0; 1910 int firsterr = 0;
1943 int retval; 1911 int retval;
1944 static struct rcu_torture_ops *torture_ops[] = 1912 static struct rcu_torture_ops *torture_ops[] = {
1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1913 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1914 };
1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1949 1915
1950 mutex_lock(&fullstop_mutex); 1916 mutex_lock(&fullstop_mutex);
1951 1917
@@ -2163,6 +2129,8 @@ rcu_torture_init(void)
2163 firsterr = retval; 2129 firsterr = retval;
2164 goto unwind; 2130 goto unwind;
2165 } 2131 }
2132 if (object_debug)
2133 rcu_test_debug_objects();
2166 rcutorture_record_test_transition(); 2134 rcutorture_record_test_transition();
2167 mutex_unlock(&fullstop_mutex); 2135 mutex_unlock(&fullstop_mutex);
2168 return 0; 2136 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 068de3a93606..4c06ddfea7cd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -53,18 +54,37 @@
53#include <linux/delay.h> 54#include <linux/delay.h>
54#include <linux/stop_machine.h> 55#include <linux/stop_machine.h>
55#include <linux/random.h> 56#include <linux/random.h>
57#include <linux/ftrace_event.h>
58#include <linux/suspend.h>
56 59
57#include "rcutree.h" 60#include "tree.h"
58#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
59 62
60#include "rcu.h" 63#include "rcu.h"
61 64
65MODULE_ALIAS("rcutree");
66#ifdef MODULE_PARAM_PREFIX
67#undef MODULE_PARAM_PREFIX
68#endif
69#define MODULE_PARAM_PREFIX "rcutree."
70
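
The MODULE_ALIAS()/MODULE_PARAM_PREFIX lines above keep the user-visible names stable across the file move: parameters defined in kernel/rcu/tree.c are still accepted as rcutree.<name> on the boot command line rather than picking up a new prefix from the renamed object file. A hedged sketch of the same idiom (placeholder parameter, not one of tree.c's):

#include <linux/module.h>
#include <linux/moduleparam.h>

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."          /* boot args stay rcutree.* */

static long example_limit = 10;                 /* hypothetical knob */
module_param(example_limit, long, 0444);        /* i.e. rcutree.example_limit=N */
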
62/* Data structures. */ 71/* Data structures. */
63 72
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 74static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 75
67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ 76/*
77 * In order to export the rcu_state name to the tracing tools, it
78 * needs to be added to the __tracepoint_string section.
79 * This requires defining a separate variable tp_<sname>_varname
80 * that points to the string being used, which allows the userspace
81 * tracing tools to map the string's address back to the matching
82 * string.
83 */
84#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
85static char sname##_varname[] = #sname; \
86static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
87struct rcu_state sname##_state = { \
68 .level = { &sname##_state.node[0] }, \ 88 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 89 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 90 .fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 95 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 96 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 97 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 98 .name = sname##_varname, \
79 .abbr = sabbr, \ 99 .abbr = sabbr, \
80} 100}; \
81 101DEFINE_PER_CPU(struct rcu_data, sname##_data)
82struct rcu_state rcu_sched_state =
83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
85 102
86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 103RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 104RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
88 105
89static struct rcu_state *rcu_state; 106static struct rcu_state *rcu_state;
90LIST_HEAD(rcu_struct_flavors); 107LIST_HEAD(rcu_struct_flavors);
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
178 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 195 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
179 196
180 if (rdp->passed_quiesce == 0) 197 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 198 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
182 rdp->passed_quiesce = 1; 199 rdp->passed_quiesce = 1;
183} 200}
184 201
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 204 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 205
189 if (rdp->passed_quiesce == 0) 206 if (rdp->passed_quiesce == 0)
190 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 207 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
191 rdp->passed_quiesce = 1; 208 rdp->passed_quiesce = 1;
192} 209}
193 210
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
198 */ 215 */
199void rcu_note_context_switch(int cpu) 216void rcu_note_context_switch(int cpu)
200{ 217{
201 trace_rcu_utilization("Start context switch"); 218 trace_rcu_utilization(TPS("Start context switch"));
202 rcu_sched_qs(cpu); 219 rcu_sched_qs(cpu);
203 rcu_preempt_note_context_switch(cpu); 220 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 221 trace_rcu_utilization(TPS("End context switch"));
205} 222}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207 224
208DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
209 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
210 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
229 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
230 .dynticks_idle = ATOMIC_INIT(1),
231#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
211}; 232};
212 233
213static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 234static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
226 247
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 248static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp); 249 struct rcu_data *rdp);
229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 250static void force_qs_rnp(struct rcu_state *rsp,
251 int (*f)(struct rcu_data *rsp, bool *isidle,
252 unsigned long *maxj),
253 bool *isidle, unsigned long *maxj);
230static void force_quiescent_state(struct rcu_state *rsp); 254static void force_quiescent_state(struct rcu_state *rsp);
231static int rcu_pending(int cpu); 255static int rcu_pending(int cpu);
232 256
@@ -345,11 +369,12 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
345static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
346 bool user) 370 bool user)
347{ 371{
348 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
349 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
350 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
351 376
352 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
353 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
354 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 379 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
355 current->pid, current->comm, 380 current->pid, current->comm,
@@ -383,7 +408,7 @@ static void rcu_eqs_enter(bool user)
383 long long oldval; 408 long long oldval;
384 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
385 410
386 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
387 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
388 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
389 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -411,6 +436,7 @@ void rcu_idle_enter(void)
411 436
412 local_irq_save(flags); 437 local_irq_save(flags);
413 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
414 local_irq_restore(flags); 440 local_irq_restore(flags);
415} 441}
416EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +454,6 @@ void rcu_user_enter(void)
428{ 454{
429 rcu_eqs_enter(1); 455 rcu_eqs_enter(1);
430} 456}
431
432/**
433 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
434 * after the current irq returns.
435 *
436 * This is similar to rcu_user_enter() but in the context of a non-nesting
437 * irq. After this call, RCU enters into idle mode when the interrupt
438 * returns.
439 */
440void rcu_user_enter_after_irq(void)
441{
442 unsigned long flags;
443 struct rcu_dynticks *rdtp;
444
445 local_irq_save(flags);
446 rdtp = &__get_cpu_var(rcu_dynticks);
447 /* Ensure this irq is interrupting a non-idle RCU state. */
448 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
449 rdtp->dynticks_nesting = 1;
450 local_irq_restore(flags);
451}
452#endif /* CONFIG_RCU_USER_QS */ 457#endif /* CONFIG_RCU_USER_QS */
453 458
454/** 459/**
@@ -474,14 +479,15 @@ void rcu_irq_exit(void)
474 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
475 480
476 local_irq_save(flags); 481 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
479 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
480 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
481 if (rdtp->dynticks_nesting) 486 if (rdtp->dynticks_nesting)
482 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 487 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
483 else 488 else
484 rcu_eqs_enter_common(rdtp, oldval, true); 489 rcu_eqs_enter_common(rdtp, oldval, true);
490 rcu_sysidle_enter(rdtp, 1);
485 local_irq_restore(flags); 491 local_irq_restore(flags);
486} 492}
487 493
@@ -501,11 +507,12 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
501 smp_mb__after_atomic_inc(); /* See above. */ 507 smp_mb__after_atomic_inc(); /* See above. */
502 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 508 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
503 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
504 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
505 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
506 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
507 514
508 trace_rcu_dyntick("Error on exit: not idle task", 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
509 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
510 ftrace_dump(DUMP_ORIG); 517 ftrace_dump(DUMP_ORIG);
511 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 518 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -523,7 +530,7 @@ static void rcu_eqs_exit(bool user)
523 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
524 long long oldval; 531 long long oldval;
525 532
526 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
527 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
528 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
529 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -550,6 +557,7 @@ void rcu_idle_exit(void)
550 557
551 local_irq_save(flags); 558 local_irq_save(flags);
552 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
553 local_irq_restore(flags); 561 local_irq_restore(flags);
554} 562}
555EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +573,6 @@ void rcu_user_exit(void)
565{ 573{
566 rcu_eqs_exit(1); 574 rcu_eqs_exit(1);
567} 575}
568
569/**
570 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
571 * idle mode after the current non-nesting irq returns.
572 *
573 * This is similar to rcu_user_exit() but in the context of an irq.
574 * This is called when the irq has interrupted a userspace RCU idle mode
575 * context. When the current non-nesting interrupt returns after this call,
576 * the CPU won't restore the RCU idle mode.
577 */
578void rcu_user_exit_after_irq(void)
579{
580 unsigned long flags;
581 struct rcu_dynticks *rdtp;
582
583 local_irq_save(flags);
584 rdtp = &__get_cpu_var(rcu_dynticks);
585 /* Ensure we are interrupting an RCU idle mode. */
586 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
587 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
588 local_irq_restore(flags);
589}
590#endif /* CONFIG_RCU_USER_QS */ 576#endif /* CONFIG_RCU_USER_QS */
591 577
592/** 578/**
@@ -615,14 +601,15 @@ void rcu_irq_enter(void)
615 long long oldval; 601 long long oldval;
616 602
617 local_irq_save(flags); 603 local_irq_save(flags);
618 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
619 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
620 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
621 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
622 if (oldval) 608 if (oldval)
623 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 609 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
624 else 610 else
625 rcu_eqs_exit_common(rdtp, oldval, true); 611 rcu_eqs_exit_common(rdtp, oldval, true);
612 rcu_sysidle_exit(rdtp, 1);
626 local_irq_restore(flags); 613 local_irq_restore(flags);
627} 614}
628 615
@@ -635,7 +622,7 @@ void rcu_irq_enter(void)
635 */ 622 */
636void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
637{ 624{
638 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
639 626
640 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
641 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -657,7 +644,7 @@ void rcu_nmi_enter(void)
657 */ 644 */
658void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
659{ 646{
660 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
661 648
662 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
663 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -670,21 +657,34 @@ void rcu_nmi_exit(void)
670} 657}
671 658
672/** 659/**
673 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
674 * 674 *
675 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
676 * or NMI handler, return true. 676 * or NMI handler, return true.
677 */ 677 */
678int rcu_is_cpu_idle(void) 678bool rcu_is_watching(void)
679{ 679{
680 int ret; 680 int ret;
681 681
682 preempt_disable(); 682 preempt_disable();
683 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
684 preempt_enable(); 684 preempt_enable();
685 return ret; 685 return ret;
686} 686}
687EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
688 688
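
Put briefly, __rcu_is_watching() is the raw check for callers that already have preemption disabled, while rcu_is_watching() wraps it with preempt_disable()/preempt_enable() for general use. A hedged usage sketch (hypothetical caller, not part of the patch) of guarding an RCU read-side critical section from a context that might be running with RCU idle:

#include <linux/rcupdate.h>

/* Skip RCU-protected work when this CPU's RCU is not watching. */
static void maybe_walk_protected_list(void)
{
        if (!rcu_is_watching())
                return;         /* e.g. deep in the idle loop: readers unsafe */
        rcu_read_lock();
        /* ... dereference RCU-protected pointers here ... */
        rcu_read_unlock();
}
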
689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
690 690
@@ -718,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
718 if (in_nmi()) 718 if (in_nmi())
719 return 1; 719 return 1;
720 preempt_disable(); 720 preempt_disable();
721 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
722 rnp = rdp->mynode; 722 rnp = rdp->mynode;
723 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
724 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
738 */ 738 */
739static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
740{ 740{
741 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
742} 742}
743 743
744/* 744/*
@@ -746,9 +746,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
746 * credit them with an implicit quiescent state. Return 1 if this CPU 746 * credit them with an implicit quiescent state. Return 1 if this CPU
747 * is in dynticks idle mode, which is an extended quiescent state. 747 * is in dynticks idle mode, which is an extended quiescent state.
748 */ 748 */
749static int dyntick_save_progress_counter(struct rcu_data *rdp) 749static int dyntick_save_progress_counter(struct rcu_data *rdp,
750 bool *isidle, unsigned long *maxj)
750{ 751{
751 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 752 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
753 rcu_sysidle_check_cpu(rdp, isidle, maxj);
752 return (rdp->dynticks_snap & 0x1) == 0; 754 return (rdp->dynticks_snap & 0x1) == 0;
753} 755}
754 756
@@ -758,7 +760,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
758 * idle state since the last call to dyntick_save_progress_counter() 760 * idle state since the last call to dyntick_save_progress_counter()
759 * for this same CPU, or by virtue of having been offline. 761 * for this same CPU, or by virtue of having been offline.
760 */ 762 */
761static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 763static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
764 bool *isidle, unsigned long *maxj)
762{ 765{
763 unsigned int curr; 766 unsigned int curr;
764 unsigned int snap; 767 unsigned int snap;
@@ -775,7 +778,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
775 * of the current RCU grace period. 778 * of the current RCU grace period.
776 */ 779 */
777 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 780 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
778 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 781 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
779 rdp->dynticks_fqs++; 782 rdp->dynticks_fqs++;
780 return 1; 783 return 1;
781 } 784 }
@@ -795,7 +798,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
795 return 0; /* Grace period is not old enough. */ 798 return 0; /* Grace period is not old enough. */
796 barrier(); 799 barrier();
797 if (cpu_is_offline(rdp->cpu)) { 800 if (cpu_is_offline(rdp->cpu)) {
798 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 801 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
799 rdp->offline_fqs++; 802 rdp->offline_fqs++;
800 return 1; 803 return 1;
801 } 804 }
@@ -814,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
814 817
815static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
816{ 819{
817 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
818 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
819} 825}
820 826
821/* 827/*
@@ -910,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
910 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
911} 917}
912 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
913static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
914{ 926{
915 int cpu; 927 int cpu;
@@ -939,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
939 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
940 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
941 953
942 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress, and it could be that we are stuck in kernel space without
959 * context switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
943} 962}
944 963
945static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
946{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
947 unsigned long j; 969 unsigned long j;
948 unsigned long js; 970 unsigned long js;
949 struct rcu_node *rnp; 971 struct rcu_node *rnp;
950 972
951 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
952 return; 974 return;
953 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
984 * Now, a false positive can occur if we get a new value of
985 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
954 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
955 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
956 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
957 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
958 1008
959 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
960 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
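
The ordering argument in the comment above is the usual publish/consume pairing: record_gp_stall_check_time() stores ->gp_start, issues smp_wmb(), then stores ->jiffies_stall, while the stall check reads the fields in the reverse order with smp_rmb()s in between, so a mismatched pair can only mean that one grace period ended and another began between the fetches. Reduced to generic variables (a sketch in kernel context, not the RCU fields themselves; ACCESS_ONCE and the smp_*mb() barriers come from the usual headers), the pattern is:

static int a, b;

static void publisher(void)
{
        ACCESS_ONCE(a) = 1;
        smp_wmb();      /* order the store to a before the store to b */
        ACCESS_ONCE(b) = 1;
}

static void consumer(void)
{
        int rb, ra;

        rb = ACCESS_ONCE(b);
        smp_rmb();      /* order the load of b before the load of a */
        ra = ACCESS_ONCE(a);
        /* If rb is seen as 1 here, ra must also be seen as 1. */
}

The same reasoning is why rcu_gp_init() above now calls record_gp_stall_check_time() and does an smp_wmb() before bumping ->gpnum.
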
@@ -1032,7 +1082,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1032 * rcu_nocb_wait_gp(). 1082 * rcu_nocb_wait_gp().
1033 */ 1083 */
1034static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1084static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1035 unsigned long c, char *s) 1085 unsigned long c, const char *s)
1036{ 1086{
1037 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1087 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1038 rnp->completed, c, rnp->level, 1088 rnp->completed, c, rnp->level,
@@ -1058,9 +1108,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1058 * grace period is already marked as needed, return to the caller. 1108 * grace period is already marked as needed, return to the caller.
1059 */ 1109 */
1060 c = rcu_cbs_completed(rdp->rsp, rnp); 1110 c = rcu_cbs_completed(rdp->rsp, rnp);
1061 trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); 1111 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1062 if (rnp->need_future_gp[c & 0x1]) { 1112 if (rnp->need_future_gp[c & 0x1]) {
1063 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); 1113 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1064 return c; 1114 return c;
1065 } 1115 }
1066 1116
@@ -1074,7 +1124,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1074 if (rnp->gpnum != rnp->completed || 1124 if (rnp->gpnum != rnp->completed ||
1075 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1125 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1076 rnp->need_future_gp[c & 0x1]++; 1126 rnp->need_future_gp[c & 0x1]++;
1077 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); 1127 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1078 return c; 1128 return c;
1079 } 1129 }
1080 1130
@@ -1102,7 +1152,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1102 * recorded, trace and leave. 1152 * recorded, trace and leave.
1103 */ 1153 */
1104 if (rnp_root->need_future_gp[c & 0x1]) { 1154 if (rnp_root->need_future_gp[c & 0x1]) {
1105 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); 1155 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
1106 goto unlock_out; 1156 goto unlock_out;
1107 } 1157 }
1108 1158
@@ -1111,9 +1161,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1111 1161
1112 /* If a grace period is not already in progress, start one. */ 1162 /* If a grace period is not already in progress, start one. */
1113 if (rnp_root->gpnum != rnp_root->completed) { 1163 if (rnp_root->gpnum != rnp_root->completed) {
1114 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); 1164 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1115 } else { 1165 } else {
1116 trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); 1166 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1117 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1167 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1118 } 1168 }
1119unlock_out: 1169unlock_out:
@@ -1137,7 +1187,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1137 rcu_nocb_gp_cleanup(rsp, rnp); 1187 rcu_nocb_gp_cleanup(rsp, rnp);
1138 rnp->need_future_gp[c & 0x1] = 0; 1188 rnp->need_future_gp[c & 0x1] = 0;
1139 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1189 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1140 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); 1190 trace_rcu_future_gp(rnp, rdp, c,
1191 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1141 return needmore; 1192 return needmore;
1142} 1193}
1143 1194
@@ -1205,9 +1256,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1205 1256
1206 /* Trace depending on how much we were able to accelerate. */ 1257 /* Trace depending on how much we were able to accelerate. */
1207 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1258 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1208 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); 1259 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1209 else 1260 else
1210 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); 1261 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1211} 1262}
1212 1263
1213/* 1264/*
@@ -1273,7 +1324,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1273 1324
1274 /* Remember that we saw this grace-period completion. */ 1325 /* Remember that we saw this grace-period completion. */
1275 rdp->completed = rnp->completed; 1326 rdp->completed = rnp->completed;
1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1327 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1277 } 1328 }
1278 1329
1279 if (rdp->gpnum != rnp->gpnum) { 1330 if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1334,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1283 * go looking for one. 1334 * go looking for one.
1284 */ 1335 */
1285 rdp->gpnum = rnp->gpnum; 1336 rdp->gpnum = rnp->gpnum;
1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1287 rdp->passed_quiesce = 0; 1338 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1339 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp); 1340 zero_cpu_stall_ticks(rdp);
@@ -1308,26 +1359,36 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1308} 1359}
1309 1360
1310/* 1361/*
1311 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1312 */ 1363 */
1313static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1314{ 1365{
1315 struct rcu_data *rdp; 1366 struct rcu_data *rdp;
1316 struct rcu_node *rnp = rcu_get_root(rsp); 1367 struct rcu_node *rnp = rcu_get_root(rsp);
1317 1368
1369 rcu_bind_gp_kthread();
1318 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1319 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1320 1377
1321 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1322 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1323 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1324 return 0; 1384 return 0;
1325 } 1385 }
1326 1386
1327 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1328 rsp->gpnum++;
1329 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1330 record_gp_stall_check_time(rsp); 1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1390 rsp->gpnum++;
1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1331 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1332 1393
1333 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
@@ -1376,19 +1437,28 @@ static int rcu_gp_init(struct rcu_state *rsp)
1376/* 1437/*
1377 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1378 */ 1439 */
1379int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1380{ 1441{
1381 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1443 bool isidle = false;
1444 unsigned long maxj;
1382 struct rcu_node *rnp = rcu_get_root(rsp); 1445 struct rcu_node *rnp = rcu_get_root(rsp);
1383 1446
1384 rsp->n_force_qs++; 1447 rsp->n_force_qs++;
1385 if (fqs_state == RCU_SAVE_DYNTICK) { 1448 if (fqs_state == RCU_SAVE_DYNTICK) {
1386 /* Collect dyntick-idle snapshots. */ 1449 /* Collect dyntick-idle snapshots. */
1387 force_qs_rnp(rsp, dyntick_save_progress_counter); 1450 if (is_sysidle_rcu_state(rsp)) {
1451 isidle = 1;
1452 maxj = jiffies - ULONG_MAX / 4;
1453 }
1454 force_qs_rnp(rsp, dyntick_save_progress_counter,
1455 &isidle, &maxj);
1456 rcu_sysidle_report_gp(rsp, isidle, maxj);
1388 fqs_state = RCU_FORCE_QS; 1457 fqs_state = RCU_FORCE_QS;
1389 } else { 1458 } else {
1390 /* Handle dyntick-idle and offline CPUs. */ 1459 /* Handle dyntick-idle and offline CPUs. */
1391 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1460 isidle = 0;
1461 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1392 } 1462 }
1393 /* Clear flag to prevent immediate re-entry. */ 1463 /* Clear flag to prevent immediate re-entry. */
1394 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1464 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -1448,12 +1518,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1448 rcu_nocb_gp_set(rnp, nocb); 1518 rcu_nocb_gp_set(rnp, nocb);
1449 1519
1450 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1520 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1451 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1521 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1452 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1453 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1454 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1455 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1456 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1457 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1458} 1532}
1459 1533
@@ -1463,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1463static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1464{ 1538{
1465 int fqs_state; 1539 int fqs_state;
1540 int gf;
1466 unsigned long j; 1541 unsigned long j;
1467 int ret; 1542 int ret;
1468 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1472,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1472 1547
1473 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1474 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1475 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1476 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1477 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1478 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1479 rcu_gp_init(rsp))
1480 break; 1557 break;
1481 cond_resched(); 1558 cond_resched();
1482 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1483 } 1563 }
1484 1564
1485 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1489,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1489 j = HZ; 1569 j = HZ;
1490 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1491 } 1571 }
1572 ret = 0;
1492 for (;;) { 1573 for (;;) {
1493 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1494 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1495 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1496 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1497 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1498 j); 1584 j);
@@ -1501,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1501 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1502 break; 1588 break;
1503 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1504 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1505 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1506 cond_resched(); 1599 cond_resched();
1507 } else { 1600 } else {
1508 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1509 cond_resched(); 1602 cond_resched();
1510 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1511 } 1607 }
1512 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1513 if (j > HZ) { 1609 if (j > HZ) {
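
The rewritten test above decides to force quiescent states with ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) rather than relying on ret == 0, so the comparison must stay correct when jiffies wraps. Below is a small standalone C check of why a wraparound-safe comparison is needed; the macro body follows the form used in the RCU headers (an assumption worth verifying there), and the deadline/now values are arbitrary.

	#include <limits.h>
	#include <stdio.h>

	/*
	 * Wraparound-safe "a is at or after b" for free-running unsigned
	 * counters such as jiffies (same shape as ULONG_CMP_GE() above).
	 */
	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

	int main(void)
	{
		unsigned long deadline = ULONG_MAX - 10;	/* just before wrap */
		unsigned long now = 5;				/* just after wrap */

		/* The naive comparison claims the deadline is far in the future... */
		printf("naive now >= deadline: %d\n", now >= deadline);
		/* ...while the wraparound-safe form sees that it has passed. */
		printf("safe  now >= deadline: %d\n", (int)ULONG_CMP_GE(now, deadline));
		return 0;
	}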
@@ -1555,13 +1651,17 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1555 return; 1651 return;
1556 } 1652 }
1557 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1558 1656
1559 /* 1657 /*
1560 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Deter 1659 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context. 1660 * the wakeup to interrupt context. And don't bother waking
1661 * up the running kthread.
1563 */ 1662 */
1564 irq_work_queue(&rsp->wakeup_work); 1663 if (current != rsp->gp_kthread)
1664 irq_work_queue(&rsp->wakeup_work);
1565} 1665}
1566 1666
1567/* 1667/*
@@ -1857,7 +1957,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1857 RCU_TRACE(mask = rdp->grpmask); 1957 RCU_TRACE(mask = rdp->grpmask);
1858 trace_rcu_grace_period(rsp->name, 1958 trace_rcu_grace_period(rsp->name,
1859 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1959 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1860 "cpuofl"); 1960 TPS("cpuofl"));
1861} 1961}
1862 1962
1863/* 1963/*
@@ -2044,7 +2144,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2044 */ 2144 */
2045void rcu_check_callbacks(int cpu, int user) 2145void rcu_check_callbacks(int cpu, int user)
2046{ 2146{
2047 trace_rcu_utilization("Start scheduler-tick"); 2147 trace_rcu_utilization(TPS("Start scheduler-tick"));
2048 increment_cpu_stall_ticks(); 2148 increment_cpu_stall_ticks();
2049 if (user || rcu_is_cpu_rrupt_from_idle()) { 2149 if (user || rcu_is_cpu_rrupt_from_idle()) {
2050 2150
@@ -2077,7 +2177,7 @@ void rcu_check_callbacks(int cpu, int user)
2077 rcu_preempt_check_callbacks(cpu); 2177 rcu_preempt_check_callbacks(cpu);
2078 if (rcu_pending(cpu)) 2178 if (rcu_pending(cpu))
2079 invoke_rcu_core(); 2179 invoke_rcu_core();
2080 trace_rcu_utilization("End scheduler-tick"); 2180 trace_rcu_utilization(TPS("End scheduler-tick"));
2081} 2181}
2082 2182
2083/* 2183/*
@@ -2087,7 +2187,10 @@ void rcu_check_callbacks(int cpu, int user)
2087 * 2187 *
2088 * The caller must have suppressed start of new grace periods. 2188 * The caller must have suppressed start of new grace periods.
2089 */ 2189 */
2090static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2190static void force_qs_rnp(struct rcu_state *rsp,
2191 int (*f)(struct rcu_data *rsp, bool *isidle,
2192 unsigned long *maxj),
2193 bool *isidle, unsigned long *maxj)
2091{ 2194{
2092 unsigned long bit; 2195 unsigned long bit;
2093 int cpu; 2196 int cpu;
@@ -2110,9 +2213,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2110 cpu = rnp->grplo; 2213 cpu = rnp->grplo;
2111 bit = 1; 2214 bit = 1;
2112 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2215 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2113 if ((rnp->qsmask & bit) != 0 && 2216 if ((rnp->qsmask & bit) != 0) {
2114 f(per_cpu_ptr(rsp->rda, cpu))) 2217 if ((rnp->qsmaskinit & bit) != 0)
2115 mask |= bit; 2218 *isidle = 0;
2219 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2220 mask |= bit;
2221 }
2116 } 2222 }
2117 if (mask != 0) { 2223 if (mask != 0) {
2118 2224
@@ -2208,10 +2314,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2208 2314
2209 if (cpu_is_offline(smp_processor_id())) 2315 if (cpu_is_offline(smp_processor_id()))
2210 return; 2316 return;
2211 trace_rcu_utilization("Start RCU core"); 2317 trace_rcu_utilization(TPS("Start RCU core"));
2212 for_each_rcu_flavor(rsp) 2318 for_each_rcu_flavor(rsp)
2213 __rcu_process_callbacks(rsp); 2319 __rcu_process_callbacks(rsp);
2214 trace_rcu_utilization("End RCU core"); 2320 trace_rcu_utilization(TPS("End RCU core"));
2215} 2321}
2216 2322
2217/* 2323/*
@@ -2248,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2248 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2249 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2250 */ 2356 */
2251 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2252 invoke_rcu_core(); 2358 invoke_rcu_core();
2253 2359
2254 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2287,6 +2393,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2287} 2393}
2288 2394
2289/* 2395/*
2396 * RCU callback function to leak a callback.
2397 */
2398static void rcu_leak_callback(struct rcu_head *rhp)
2399{
2400}
2401
2402/*
2290 * Helper function for call_rcu() and friends. The cpu argument will 2403 * Helper function for call_rcu() and friends. The cpu argument will
2291 * normally be -1, indicating "currently running CPU". It may specify 2404 * normally be -1, indicating "currently running CPU". It may specify
2292 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2405 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2413,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2300 struct rcu_data *rdp; 2413 struct rcu_data *rdp;
2301 2414
2302 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2415 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2303 debug_rcu_head_queue(head); 2416 if (debug_rcu_head_queue(head)) {
2417 /* Probable double call_rcu(), so leak the callback. */
2418 ACCESS_ONCE(head->func) = rcu_leak_callback;
2419 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
2420 return;
2421 }
2304 head->func = func; 2422 head->func = func;
2305 head->next = NULL; 2423 head->next = NULL;
2306 2424
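
The new rcu_leak_callback()/debug_rcu_head_queue() handling above neutralizes a probable double call_rcu() instead of corrupting the callback list. Here is a userspace sketch of the same defensive pattern, with a hypothetical cb_node type and enqueue() helper standing in for struct rcu_head and __call_rcu(); the duplicate submission keeps its slot, but its function is swapped for a no-op, so it is deliberately "leaked".

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical callback node, standing in for struct rcu_head. */
	struct cb_node {
		struct cb_node *next;
		void (*func)(struct cb_node *);
		bool queued;		/* debug state, like debug_rcu_head_queue() */
	};

	static void leak_callback(struct cb_node *n) { /* deliberately does nothing */ }
	static void say_hello(struct cb_node *n) { printf("callback ran\n"); }

	static struct cb_node *head;

	static void enqueue(struct cb_node *n, void (*func)(struct cb_node *))
	{
		if (n->queued) {
			/* Probable double enqueue: neutralize rather than corrupt the list. */
			n->func = leak_callback;
			fprintf(stderr, "enqueue(): leaked duplicate callback\n");
			return;
		}
		n->queued = true;
		n->func = func;
		n->next = head;
		head = n;
	}

	int main(void)
	{
		static struct cb_node n;

		enqueue(&n, say_hello);
		enqueue(&n, say_hello);	/* second call is detected; nothing prints */
		for (struct cb_node *p = head; p; p = p->next)
			p->func(p);
		return 0;
	}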
@@ -2706,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2706 2824
2707 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2708 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2709 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2710 al = false; 2831 al = false;
2711 if (rdp->nxtlist) 2832 break;
2712 hc = true; 2833 }
2713 } 2834 }
2714 if (all_lazy) 2835 if (all_lazy)
2715 *all_lazy = al; 2836 *all_lazy = al;
@@ -2720,7 +2841,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2720 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 2841 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2721 * the compiler is expected to optimize this away. 2842 * the compiler is expected to optimize this away.
2722 */ 2843 */
2723static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, 2844static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
2724 int cpu, unsigned long done) 2845 int cpu, unsigned long done)
2725{ 2846{
2726 trace_rcu_barrier(rsp->name, s, cpu, 2847 trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2906,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
2785 * transition. The "if" expression below therefore rounds the old 2906 * transition. The "if" expression below therefore rounds the old
2786 * value up to the next even number and adds two before comparing. 2907 * value up to the next even number and adds two before comparing.
2787 */ 2908 */
2788 snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2909 snap_done = rsp->n_barrier_done;
2789 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2910 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2790 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2911
2912 /*
2913 * If the value in snap is odd, we needed to wait for the current
2914 * rcu_barrier() to complete, then wait for the next one, in other
2915 * words, we need the value of snap_done to be three larger than
2916 * the value of snap. On the other hand, if the value in snap is
2917 * even, we only had to wait for the next rcu_barrier() to complete,
2918 * in other words, we need the value of snap_done to be only two
2919 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
2920 * this for us (thank you, Linus!).
2921 */
2922 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
2791 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2923 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2792 smp_mb(); /* caller's subsequent code after above check. */ 2924 smp_mb(); /* caller's subsequent code after above check. */
2793 mutex_unlock(&rsp->barrier_mutex); 2925 mutex_unlock(&rsp->barrier_mutex);
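
The new comment above hinges on the arithmetic of (snap + 3) & ~0x1: an even snap needs snap_done to advance by two, an odd snap by three. A quick standalone check with arbitrary example values:

	#include <stdio.h>

	int main(void)
	{
		/* even: no barrier in flight when sampled; odd: one in flight */
		unsigned long snaps[] = { 4, 5 };

		for (int i = 0; i < 2; i++) {
			unsigned long snap = snaps[i];
			unsigned long need = (snap + 3) & ~0x1UL;

			printf("snap=%lu (%s) -> wait until n_barrier_done >= %lu (%lu ahead)\n",
			       snap, snap & 1 ? "odd" : "even", need, need - snap);
		}
		return 0;
	}

The older expression ((snap + 1) & ~0x1) + 2 yields the same two results; the replacement is simply the shorter form the comment credits to Linus.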
@@ -2930,6 +3062,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2930 rdp->blimit = blimit; 3062 rdp->blimit = blimit;
2931 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3063 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2932 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3064 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3065 rcu_sysidle_init_percpu_data(rdp->dynticks);
2933 atomic_set(&rdp->dynticks->dynticks, 3066 atomic_set(&rdp->dynticks->dynticks,
2934 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3067 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2935 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3068 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +3085,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2952 rdp->completed = rnp->completed; 3085 rdp->completed = rnp->completed;
2953 rdp->passed_quiesce = 0; 3086 rdp->passed_quiesce = 0;
2954 rdp->qs_pending = 0; 3087 rdp->qs_pending = 0;
2955 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 3088 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
2956 } 3089 }
2957 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 3090 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2958 rnp = rnp->parent; 3091 rnp = rnp->parent;
@@ -2982,7 +3115,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
2982 struct rcu_node *rnp = rdp->mynode; 3115 struct rcu_node *rnp = rdp->mynode;
2983 struct rcu_state *rsp; 3116 struct rcu_state *rsp;
2984 3117
2985 trace_rcu_utilization("Start CPU hotplug"); 3118 trace_rcu_utilization(TPS("Start CPU hotplug"));
2986 switch (action) { 3119 switch (action) {
2987 case CPU_UP_PREPARE: 3120 case CPU_UP_PREPARE:
2988 case CPU_UP_PREPARE_FROZEN: 3121 case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3144,26 @@ static int rcu_cpu_notify(struct notifier_block *self,
3011 default: 3144 default:
3012 break; 3145 break;
3013 } 3146 }
3014 trace_rcu_utilization("End CPU hotplug"); 3147 trace_rcu_utilization(TPS("End CPU hotplug"));
3148 return NOTIFY_OK;
3149}
3150
3151static int rcu_pm_notify(struct notifier_block *self,
3152 unsigned long action, void *hcpu)
3153{
3154 switch (action) {
3155 case PM_HIBERNATION_PREPARE:
3156 case PM_SUSPEND_PREPARE:
3157 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3158 rcu_expedited = 1;
3159 break;
3160 case PM_POST_HIBERNATION:
3161 case PM_POST_SUSPEND:
3162 rcu_expedited = 0;
3163 break;
3164 default:
3165 break;
3166 }
3015 return NOTIFY_OK; 3167 return NOTIFY_OK;
3016} 3168}
3017 3169
@@ -3166,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3166 3318
3167/* 3319/*
3168 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3169 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3170 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3171 */ 3323 */
3172static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3245,8 +3397,8 @@ void __init rcu_init(void)
3245 3397
3246 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3247 rcu_init_geometry(); 3399 rcu_init_geometry();
3248 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3249 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3250 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3251 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3252 3404
@@ -3256,8 +3408,9 @@ void __init rcu_init(void)
3256 * or the scheduler are operational. 3408 * or the scheduler are operational.
3257 */ 3409 */
3258 cpu_notifier(rcu_cpu_notify, 0); 3410 cpu_notifier(rcu_cpu_notify, 0);
3411 pm_notifier(rcu_pm_notify, 0);
3259 for_each_online_cpu(cpu) 3412 for_each_online_cpu(cpu)
3260 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3261} 3414}
3262 3415
3263#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index b3832581043c..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
92 long long dynticks_idle_nesting;
93 /* irq/process nesting level from idle. */
94 atomic_t dynticks_idle; /* Even value for idle, else odd. */
95 /* "Idle" excludes userspace execution. */
96 unsigned long dynticks_idle_jiffies;
97 /* End of last non-NMI non-idle period. */
98#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
91#ifdef CONFIG_RCU_FAST_NO_HZ 99#ifdef CONFIG_RCU_FAST_NO_HZ
92 bool all_lazy; /* Are all CPU's CBs lazy? */ 100 bool all_lazy; /* Are all CPU's CBs lazy? */
93 unsigned long nonlazy_posted; 101 unsigned long nonlazy_posted;
@@ -96,6 +104,8 @@ struct rcu_dynticks {
96 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
97 unsigned long last_accelerate; 105 unsigned long last_accelerate;
98 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
99 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 111};
@@ -445,7 +455,7 @@ struct rcu_state {
445 /* for CPU stalls. */ 455 /* for CPU stalls. */
446 unsigned long gp_max; /* Maximum GP duration in */ 456 unsigned long gp_max; /* Maximum GP duration in */
447 /* jiffies. */ 457 /* jiffies. */
448 char *name; /* Name of structure. */ 458 const char *name; /* Name of structure. */
449 char abbr; /* Abbreviated name. */ 459 char abbr; /* Abbreviated name. */
450 struct list_head flavors; /* List of RCU flavors. */ 460 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */ 461 struct irq_work wakeup_work; /* Postponed wakeups */
@@ -545,6 +555,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
545static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 555static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
546static void rcu_kick_nohz_cpu(int cpu); 556static void rcu_kick_nohz_cpu(int cpu);
547static bool init_nocb_callback_list(struct rcu_data *rdp); 557static bool init_nocb_callback_list(struct rcu_data *rdp);
558static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
559static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
560static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
561 unsigned long *maxj);
562static bool is_sysidle_rcu_state(struct rcu_state *rsp);
563static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
564 unsigned long maxj);
565static void rcu_bind_gp_kthread(void);
566static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
548 567
549#endif /* #ifndef RCU_TREE_NONCORE */ 568#endif /* #ifndef RCU_TREE_NONCORE */
550 569
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 769e12e3151b..3822ac0c4b27 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h> 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -110,9 +115,7 @@ static void __init rcu_bootup_announce_oddness(void)
110 115
111#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
112 117
113struct rcu_state rcu_preempt_state = 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
116static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state = &rcu_preempt_state;
117 120
118static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +172,7 @@ static void rcu_preempt_qs(int cpu)
169 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 172 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
170 173
171 if (rdp->passed_quiesce == 0) 174 if (rdp->passed_quiesce == 0)
172 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 175 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
173 rdp->passed_quiesce = 1; 176 rdp->passed_quiesce = 1;
174 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 177 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
175} 178}
@@ -388,7 +391,7 @@ void rcu_read_unlock_special(struct task_struct *t)
388 np = rcu_next_node_entry(t, rnp); 391 np = rcu_next_node_entry(t, rnp);
389 list_del_init(&t->rcu_node_entry); 392 list_del_init(&t->rcu_node_entry);
390 t->rcu_blocked_node = NULL; 393 t->rcu_blocked_node = NULL;
391 trace_rcu_unlock_preempted_task("rcu_preempt", 394 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
392 rnp->gpnum, t->pid); 395 rnp->gpnum, t->pid);
393 if (&t->rcu_node_entry == rnp->gp_tasks) 396 if (&t->rcu_node_entry == rnp->gp_tasks)
394 rnp->gp_tasks = np; 397 rnp->gp_tasks = np;
@@ -412,7 +415,7 @@ void rcu_read_unlock_special(struct task_struct *t)
412 */ 415 */
413 empty_exp_now = !rcu_preempted_readers_exp(rnp); 416 empty_exp_now = !rcu_preempted_readers_exp(rnp);
414 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 417 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
415 trace_rcu_quiescent_state_report("preempt_rcu", 418 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
416 rnp->gpnum, 419 rnp->gpnum,
417 0, rnp->qsmask, 420 0, rnp->qsmask,
418 rnp->level, 421 rnp->level,
@@ -662,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
662 665
663static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
664{ 667{
665 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
666} 669}
667 670
668#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1130,7 +1133,7 @@ void exit_rcu(void)
1130 1133
1131#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1132 1135
1133#include "rtmutex_common.h" 1136#include "../rtmutex_common.h"
1134 1137
1135#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1136 1139
@@ -1250,12 +1253,12 @@ static int rcu_boost_kthread(void *arg)
1250 int spincnt = 0; 1253 int spincnt = 0;
1251 int more2boost; 1254 int more2boost;
1252 1255
1253 trace_rcu_utilization("Start boost kthread@init"); 1256 trace_rcu_utilization(TPS("Start boost kthread@init"));
1254 for (;;) { 1257 for (;;) {
1255 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1258 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1256 trace_rcu_utilization("End boost kthread@rcu_wait"); 1259 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1257 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1260 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1258 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1261 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1259 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1260 more2boost = rcu_boost(rnp); 1263 more2boost = rcu_boost(rnp);
1261 if (more2boost) 1264 if (more2boost)
@@ -1264,14 +1267,14 @@ static int rcu_boost_kthread(void *arg)
1264 spincnt = 0; 1267 spincnt = 0;
1265 if (spincnt > 10) { 1268 if (spincnt > 10) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1269 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1267 trace_rcu_utilization("End boost kthread@rcu_yield"); 1270 trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1268 schedule_timeout_interruptible(2); 1271 schedule_timeout_interruptible(2);
1269 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1272 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1270 spincnt = 0; 1273 spincnt = 0;
1271 } 1274 }
1272 } 1275 }
1273 /* NOTREACHED */ 1276 /* NOTREACHED */
1274 trace_rcu_utilization("End boost kthread@notreached"); 1277 trace_rcu_utilization(TPS("End boost kthread@notreached"));
1275 return 0; 1278 return 0;
1276} 1279}
1277 1280
@@ -1334,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1334 */ 1337 */
1335static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1336{ 1339{
1337 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1338} 1341}
1339 1342
1340#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1384,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1384 1387
1385static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1386{ 1389{
1387 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1388 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1389 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1390} 1393}
1391 1394
@@ -1404,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1404 1407
1405static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1406{ 1409{
1407 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1408} 1411}
1409 1412
1410/* 1413/*
@@ -1414,12 +1417,12 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1414 */ 1417 */
1415static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1416{ 1419{
1417 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1418 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1419 int spincnt; 1422 int spincnt;
1420 1423
1421 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
1422 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1425 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1423 local_bh_disable(); 1426 local_bh_disable();
1424 *statusp = RCU_KTHREAD_RUNNING; 1427 *statusp = RCU_KTHREAD_RUNNING;
1425 this_cpu_inc(rcu_cpu_kthread_loops); 1428 this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1434,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
1431 rcu_kthread_do_work(); 1434 rcu_kthread_do_work();
1432 local_bh_enable(); 1435 local_bh_enable();
1433 if (*workp == 0) { 1436 if (*workp == 0) {
1434 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1437 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1435 *statusp = RCU_KTHREAD_WAITING; 1438 *statusp = RCU_KTHREAD_WAITING;
1436 return; 1439 return;
1437 } 1440 }
1438 } 1441 }
1439 *statusp = RCU_KTHREAD_YIELDING; 1442 *statusp = RCU_KTHREAD_YIELDING;
1440 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1443 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1441 schedule_timeout_interruptible(2); 1444 schedule_timeout_interruptible(2);
1442 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1445 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1443 *statusp = RCU_KTHREAD_WAITING; 1446 *statusp = RCU_KTHREAD_WAITING;
1444} 1447}
1445 1448
@@ -1632,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);
1632extern int tick_nohz_enabled; 1635extern int tick_nohz_enabled;
1633 1636
1634/* 1637/*
1635 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1636 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1637 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1638 */ 1641 */
1639static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1640{ 1643{
1641 bool cbs_ready = false; 1644 bool cbs_ready = false;
1642 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1643 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1644 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1645 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1646 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1647 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1648 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
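
The last_advance_all check added above turns the callback-advancing pass into an at-most-once-per-jiffy operation. A userspace analogue of that rate-limiting style, using whole seconds from time(2) in place of jiffies (the names are made up):

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	static time_t last_run;

	/* Do the expensive pass at most once per second (stand-in for once per jiffy). */
	static bool try_expensive_pass(void)
	{
		time_t now = time(NULL);

		if (now == last_run)
			return false;	/* ran recently enough, skip */
		last_run = now;
		puts("doing the expensive pass");
		return true;
	}

	int main(void)
	{
		/* Attempts after the first usually land in the same second and are skipped. */
		for (int i = 0; i < 3; i++)
			printf("attempt %d ran: %d\n", i, try_expensive_pass());
		return 0;
	}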
@@ -1741,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1741 */ 1750 */
1742 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1743 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1744 invoke_rcu_core(); 1755 invoke_rcu_core();
1745 return; 1756 return;
1746 } 1757 }
@@ -1770,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1770 */ 1781 */
1771static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1772{ 1783{
1773 struct rcu_data *rdp;
1774 struct rcu_state *rsp;
1775 1784
1776 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1777 return; 1786 return;
1778 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1779 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1780 rdp = per_cpu_ptr(rsp->rda, cpu);
1781 if (cpu_has_callbacks_ready_to_invoke(rdp))
1782 invoke_rcu_core();
1783 }
1784} 1789}
1785 1790
1786/* 1791/*
@@ -2110,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2110 2115
2111 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2112 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2113 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2114 return; 2121 return;
2122 }
2115 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2116 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2117 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2118 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2119 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2120 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2121 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2122 } 2134 }
2123 return; 2135 return;
2124} 2136}
@@ -2142,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2142 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2143 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2144 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2145 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2146 else 2159 else
2147 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2148 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2149 return 1; 2163 return 1;
2150} 2164}
2151 2165
@@ -2202,7 +2216,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2202 * Wait for the grace period. Do so interruptibly to avoid messing 2216 * Wait for the grace period. Do so interruptibly to avoid messing
2203 * up the load average. 2217 * up the load average.
2204 */ 2218 */
2205 trace_rcu_future_gp(rnp, rdp, c, "StartWait"); 2219 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2206 for (;;) { 2220 for (;;) {
2207 wait_event_interruptible( 2221 wait_event_interruptible(
2208 rnp->nocb_gp_wq[c & 0x1], 2222 rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2224,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2210 if (likely(d)) 2224 if (likely(d))
2211 break; 2225 break;
2212 flush_signals(current); 2226 flush_signals(current);
2213 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); 2227 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2214 } 2228 }
2215 trace_rcu_future_gp(rnp, rdp, c, "EndWait"); 2229 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2216 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2230 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2217} 2231}
2218 2232
@@ -2223,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2223static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2224{ 2238{
2225 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2226 struct rcu_head *list; 2241 struct rcu_head *list;
2227 struct rcu_head *next; 2242 struct rcu_head *next;
2228 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2231,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2231 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2232 for (;;) { 2247 for (;;) {
2233 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2234 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2235 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2236 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2237 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2238 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2239 flush_signals(current); 2264 flush_signals(current);
2240 continue; 2265 continue;
2241 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2242 2270
2243 /* 2271 /*
2244 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2259,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2259 next = list->next; 2287 next = list->next;
2260 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2261 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2262 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2263 next = list->next; 2295 next = list->next;
2264 } 2296 }
2265 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
@@ -2375,3 +2407,425 @@ static void rcu_kick_nohz_cpu(int cpu)
2375 smp_send_reschedule(cpu); 2407 smp_send_reschedule(cpu);
2376#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2408#endif /* #ifdef CONFIG_NO_HZ_FULL */
2377} 2409}
2410
2411
2412#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2413
2414/*
2415 * Define RCU flavor that holds sysidle state. This needs to be the
2416 * most active flavor of RCU.
2417 */
2418#ifdef CONFIG_PREEMPT_RCU
2419static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2420#else /* #ifdef CONFIG_PREEMPT_RCU */
2421static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2422#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2423
2424static int full_sysidle_state; /* Current system-idle state. */
2425#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2426#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2427#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2428#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2429#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2430
2431/*
2432 * Invoked to note exit from irq or task transition to idle. Note that
2433 * usermode execution does -not- count as idle here! After all, we want
2434 * to detect full-system idle states, not RCU quiescent states and grace
2435 * periods. The caller must have disabled interrupts.
2436 */
2437static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2438{
2439 unsigned long j;
2440
2441 /* Adjust nesting, check for fully idle. */
2442 if (irq) {
2443 rdtp->dynticks_idle_nesting--;
2444 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2445 if (rdtp->dynticks_idle_nesting != 0)
2446 return; /* Still not fully idle. */
2447 } else {
2448 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2449 DYNTICK_TASK_NEST_VALUE) {
2450 rdtp->dynticks_idle_nesting = 0;
2451 } else {
2452 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2453 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2454 return; /* Still not fully idle. */
2455 }
2456 }
2457
2458 /* Record start of fully idle period. */
2459 j = jiffies;
2460 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2461 smp_mb__before_atomic_inc();
2462 atomic_inc(&rdtp->dynticks_idle);
2463 smp_mb__after_atomic_inc();
2464 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2465}
2466
2467/*
2468 * Unconditionally force exit from full system-idle state. This is
2469 * invoked when a normal CPU exits idle, but must be called separately
2470 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2471 * is that the timekeeping CPU is permitted to take scheduling-clock
2472 * interrupts while the system is in system-idle state, and of course
2473 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2474 * interrupt from any other type of interrupt.
2475 */
2476void rcu_sysidle_force_exit(void)
2477{
2478 int oldstate = ACCESS_ONCE(full_sysidle_state);
2479 int newoldstate;
2480
2481 /*
2482 * Each pass through the following loop attempts to exit full
2483 * system-idle state. If contention proves to be a problem,
2484 * a trylock-based contention tree could be used here.
2485 */
2486 while (oldstate > RCU_SYSIDLE_SHORT) {
2487 newoldstate = cmpxchg(&full_sysidle_state,
2488 oldstate, RCU_SYSIDLE_NOT);
2489 if (oldstate == newoldstate &&
2490 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2491 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2492 return; /* We cleared it, done! */
2493 }
2494 oldstate = newoldstate;
2495 }
2496 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2497}
2498
2499/*
2500 * Invoked to note entry to irq or task transition from idle. Note that
2501 * usermode execution does -not- count as idle here! The caller must
2502 * have disabled interrupts.
2503 */
2504static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2505{
2506 /* Adjust nesting, check for already non-idle. */
2507 if (irq) {
2508 rdtp->dynticks_idle_nesting++;
2509 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2510 if (rdtp->dynticks_idle_nesting != 1)
2511 return; /* Already non-idle. */
2512 } else {
2513 /*
2514 * Allow for irq misnesting. Yes, it really is possible
2515 * to enter an irq handler then never leave it, and maybe
2516 * also vice versa. Handle both possibilities.
2517 */
2518 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2519 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2520 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2521 return; /* Already non-idle. */
2522 } else {
2523 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2524 }
2525 }
2526
2527 /* Record end of idle period. */
2528 smp_mb__before_atomic_inc();
2529 atomic_inc(&rdtp->dynticks_idle);
2530 smp_mb__after_atomic_inc();
2531 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2532
2533 /*
2534 * If we are the timekeeping CPU, we are permitted to be non-idle
2535 * during a system-idle state. This must be the case, because
2536 * the timekeeping CPU has to take scheduling-clock interrupts
2537 * during the time that the system is transitioning to full
2538 * system-idle state. This means that the timekeeping CPU must
2539 * invoke rcu_sysidle_force_exit() directly if it does anything
2540 * more than take a scheduling-clock interrupt.
2541 */
2542 if (smp_processor_id() == tick_do_timer_cpu)
2543 return;
2544
2545 /* Update system-idle state: We are clearly no longer fully idle! */
2546 rcu_sysidle_force_exit();
2547}
2548
2549/*
2550 * Check to see if the current CPU is idle. Note that usermode execution
2551 * does not count as idle. The caller must have disabled interrupts.
2552 */
2553static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2554 unsigned long *maxj)
2555{
2556 int cur;
2557 unsigned long j;
2558 struct rcu_dynticks *rdtp = rdp->dynticks;
2559
2560 /*
2561 * If some other CPU has already reported non-idle, if this is
2562 * not the flavor of RCU that tracks sysidle state, or if this
 2563 * is an offline CPU or the timekeeping CPU, nothing to do.
2564 */
2565 if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2566 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2567 return;
2568 if (rcu_gp_in_progress(rdp->rsp))
2569 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2570
2571 /* Pick up current idle and NMI-nesting counter and check. */
2572 cur = atomic_read(&rdtp->dynticks_idle);
2573 if (cur & 0x1) {
2574 *isidle = false; /* We are not idle! */
2575 return;
2576 }
2577 smp_mb(); /* Read counters before timestamps. */
2578
2579 /* Pick up timestamps. */
2580 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2581 /* If this CPU entered idle more recently, update maxj timestamp. */
2582 if (ULONG_CMP_LT(*maxj, j))
2583 *maxj = j;
2584}
2585
2586/*
2587 * Is this the flavor of RCU that is handling full-system idle?
2588 */
2589static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2590{
2591 return rsp == rcu_sysidle_state;
2592}
2593
2594/*
2595 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2596 * timekeeping CPU.
2597 */
2598static void rcu_bind_gp_kthread(void)
2599{
2600 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2601
2602 if (cpu < 0 || cpu >= nr_cpu_ids)
2603 return;
2604 if (raw_smp_processor_id() != cpu)
2605 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2606}
2607
2608/*
2609 * Return a delay in jiffies based on the number of CPUs, rcu_node
2610 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2611 * systems more time to transition to full-idle state in order to
 2612 * avoid the cache thrashing that would otherwise occur on the state variable.
2613 * Really small systems (less than a couple of tens of CPUs) should
2614 * instead use a single global atomically incremented counter, and later
2615 * versions of this will automatically reconfigure themselves accordingly.
2616 */
2617static unsigned long rcu_sysidle_delay(void)
2618{
2619 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2620 return 0;
2621 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2622}
2623
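
For a feel of the delay rcu_sysidle_delay() produces, here is the same formula evaluated standalone with made-up configurations; the kernel version additionally returns 0 outright for systems at or below CONFIG_NO_HZ_FULL_SYSIDLE_SMALL CPUs, which this sketch skips.

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	/* Same formula as rcu_sysidle_delay(); the configurations below are made up. */
	static unsigned long sysidle_delay(unsigned long nr_cpu_ids,
					   unsigned long hz,
					   unsigned long fanout_leaf)
	{
		return DIV_ROUND_UP(nr_cpu_ids * hz, fanout_leaf * 1000);
	}

	int main(void)
	{
		printf("4096 CPUs, HZ=1000, leaf=16 -> %lu jiffies\n",
		       sysidle_delay(4096, 1000, 16));	/* 256 */
		printf(" 100 CPUs, HZ=250,  leaf=16 -> %lu jiffies\n",
		       sysidle_delay(100, 250, 16));	/* 25000/16000 rounds up to 2 */
		return 0;
	}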
2624/*
2625 * Advance the full-system-idle state. This is invoked when all of
2626 * the non-timekeeping CPUs are idle.
2627 */
2628static void rcu_sysidle(unsigned long j)
2629{
2630 /* Check the current state. */
2631 switch (ACCESS_ONCE(full_sysidle_state)) {
2632 case RCU_SYSIDLE_NOT:
2633
2634 /* First time all are idle, so note a short idle period. */
2635 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2636 break;
2637
2638 case RCU_SYSIDLE_SHORT:
2639
2640 /*
2641 * Idle for a bit, time to advance to next state?
2642 * cmpxchg failure means race with non-idle, let them win.
2643 */
2644 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2645 (void)cmpxchg(&full_sysidle_state,
2646 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2647 break;
2648
2649 case RCU_SYSIDLE_LONG:
2650
2651 /*
2652 * Do an additional check pass before advancing to full.
2653 * cmpxchg failure means race with non-idle, let them win.
2654 */
2655 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2656 (void)cmpxchg(&full_sysidle_state,
2657 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2658 break;
2659
2660 default:
2661 break;
2662 }
2663}
2664
2665/*
2666 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2667 * back to the beginning.
2668 */
2669static void rcu_sysidle_cancel(void)
2670{
2671 smp_mb();
2672 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2673}
2674
2675/*
2676 * Update the sysidle state based on the results of a force-quiescent-state
2677 * scan of the CPUs' dyntick-idle state.
2678 */
2679static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2680 unsigned long maxj, bool gpkt)
2681{
2682 if (rsp != rcu_sysidle_state)
2683 return; /* Wrong flavor, ignore. */
2684 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2685 return; /* Running state machine from timekeeping CPU. */
2686 if (isidle)
2687 rcu_sysidle(maxj); /* More idle! */
2688 else
2689 rcu_sysidle_cancel(); /* Idle is over. */
2690}
2691
2692/*
2693 * Wrapper for rcu_sysidle_report() when called from the grace-period
2694 * kthread's context.
2695 */
2696static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2697 unsigned long maxj)
2698{
2699 rcu_sysidle_report(rsp, isidle, maxj, true);
2700}
2701
2702/* Callback and function for forcing an RCU grace period. */
2703struct rcu_sysidle_head {
2704 struct rcu_head rh;
2705 int inuse;
2706};
2707
2708static void rcu_sysidle_cb(struct rcu_head *rhp)
2709{
2710 struct rcu_sysidle_head *rshp;
2711
2712 /*
2713 * The following memory barrier is needed to replace the
2714 * memory barriers that would normally be in the memory
2715 * allocator.
2716 */
2717 smp_mb(); /* grace period precedes setting inuse. */
2718
2719 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2720 ACCESS_ONCE(rshp->inuse) = 0;
2721}
2722
2723/*
2724 * Check to see if the system is fully idle, other than the timekeeping CPU.
2725 * The caller must have disabled interrupts.
2726 */
2727bool rcu_sys_is_idle(void)
2728{
2729 static struct rcu_sysidle_head rsh;
2730 int rss = ACCESS_ONCE(full_sysidle_state);
2731
2732 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2733 return false;
2734
2735 /* Handle small-system case by doing a full scan of CPUs. */
2736 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2737 int oldrss = rss - 1;
2738
2739 /*
2740 * One pass to advance to each state up to _FULL.
2741 * Give up if any pass fails to advance the state.
2742 */
2743 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2744 int cpu;
2745 bool isidle = true;
2746 unsigned long maxj = jiffies - ULONG_MAX / 4;
2747 struct rcu_data *rdp;
2748
2749 /* Scan all the CPUs looking for nonidle CPUs. */
2750 for_each_possible_cpu(cpu) {
2751 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2752 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2753 if (!isidle)
2754 break;
2755 }
2756 rcu_sysidle_report(rcu_sysidle_state,
2757 isidle, maxj, false);
2758 oldrss = rss;
2759 rss = ACCESS_ONCE(full_sysidle_state);
2760 }
2761 }
2762
2763 /* If this is the first observation of an idle period, record it. */
2764 if (rss == RCU_SYSIDLE_FULL) {
2765 rss = cmpxchg(&full_sysidle_state,
2766 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2767 return rss == RCU_SYSIDLE_FULL;
2768 }
2769
2770 smp_mb(); /* ensure rss load happens before later caller actions. */
2771
2772 /* If already fully idle, tell the caller (in case of races). */
2773 if (rss == RCU_SYSIDLE_FULL_NOTED)
2774 return true;
2775
2776 /*
2777 * If we aren't there yet, and a grace period is not in flight,
2778 * initiate a grace period. Either way, tell the caller that
2779 * we are not there yet. We use an xchg() rather than an assignment
2780 * to make up for the memory barriers that would otherwise be
2781 * provided by the memory allocator.
2782 */
2783 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2784 !rcu_gp_in_progress(rcu_sysidle_state) &&
2785 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2786 call_rcu(&rsh.rh, rcu_sysidle_cb);
2787 return false;
2788}
2789
2790/*
2791 * Initialize dynticks sysidle state for CPUs coming online.
2792 */
2793static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2794{
2795 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2796}
2797
2798#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2799
2800static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2801{
2802}
2803
2804static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2805{
2806}
2807
2808static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2809 unsigned long *maxj)
2810{
2811}
2812
2813static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2814{
2815 return false;
2816}
2817
2818static void rcu_bind_gp_kthread(void)
2819{
2820}
2821
2822static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2823 unsigned long maxj)
2824{
2825}
2826
2827static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2828{
2829}
2830
2831#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index cce6ba8bbace..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -122,7 +128,7 @@ struct lockdep_map rcu_sched_lock_map =
122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
123EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
124 130
125int debug_lockdep_rcu_enabled(void) 131int notrace debug_lockdep_rcu_enabled(void)
126{ 132{
127 return rcu_scheduler_active && debug_locks && 133 return rcu_scheduler_active && debug_locks &&
128 current->lockdep_recursion == 0; 134 current->lockdep_recursion == 0;
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -212,43 +218,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
212} 218}
213 219
214/* 220/*
215 * fixup_init is called when:
216 * - an active object is initialized
217 */
218static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
219{
220 struct rcu_head *head = addr;
221
222 switch (state) {
223 case ODEBUG_STATE_ACTIVE:
224 /*
225 * Ensure that queued callbacks are all executed.
226 * If we detect that we are nested in a RCU read-side critical
227 * section, we should simply fail, otherwise we would deadlock.
228 * In !PREEMPT configurations, there is no way to tell if we are
229 * in a RCU read-side critical section or not, so we never
230 * attempt any fixup and just print a warning.
231 */
232#ifndef CONFIG_PREEMPT
233 WARN_ON_ONCE(1);
234 return 0;
235#endif
236 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
237 irqs_disabled()) {
238 WARN_ON_ONCE(1);
239 return 0;
240 }
241 rcu_barrier();
242 rcu_barrier_sched();
243 rcu_barrier_bh();
244 debug_object_init(head, &rcuhead_debug_descr);
245 return 1;
246 default:
247 return 0;
248 }
249}
250
251/*
252 * fixup_activate is called when: 221 * fixup_activate is called when:
253 * - an active object is activated 222 * - an active object is activated
254 * - an unknown object is activated (might be a statically initialized object) 223 * - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +237,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
268 debug_object_init(head, &rcuhead_debug_descr); 237 debug_object_init(head, &rcuhead_debug_descr);
269 debug_object_activate(head, &rcuhead_debug_descr); 238 debug_object_activate(head, &rcuhead_debug_descr);
270 return 0; 239 return 0;
271
272 case ODEBUG_STATE_ACTIVE:
273 /*
274 * Ensure that queued callbacks are all executed.
275 * If we detect that we are nested in a RCU read-side critical
276 * section, we should simply fail, otherwise we would deadlock.
277 * In !PREEMPT configurations, there is no way to tell if we are
278 * in a RCU read-side critical section or not, so we never
279 * attempt any fixup and just print a warning.
280 */
281#ifndef CONFIG_PREEMPT
282 WARN_ON_ONCE(1);
283 return 0;
284#endif
285 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
286 irqs_disabled()) {
287 WARN_ON_ONCE(1);
288 return 0;
289 }
290 rcu_barrier();
291 rcu_barrier_sched();
292 rcu_barrier_bh();
293 debug_object_activate(head, &rcuhead_debug_descr);
294 return 1;
295 default: 240 default:
296 return 0;
297 }
298}
299
300/*
301 * fixup_free is called when:
302 * - an active object is freed
303 */
304static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
305{
306 struct rcu_head *head = addr;
307
308 switch (state) {
309 case ODEBUG_STATE_ACTIVE:
310 /*
311 * Ensure that queued callbacks are all executed.
312 * If we detect that we are nested in a RCU read-side critical
313 * section, we should simply fail, otherwise we would deadlock.
314 * In !PREEMPT configurations, there is no way to tell if we are
315 * in a RCU read-side critical section or not, so we never
316 * attempt any fixup and just print a warning.
317 */
318#ifndef CONFIG_PREEMPT
319 WARN_ON_ONCE(1);
320 return 0;
321#endif
322 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
323 irqs_disabled()) {
324 WARN_ON_ONCE(1);
325 return 0;
326 }
327 rcu_barrier();
328 rcu_barrier_sched();
329 rcu_barrier_bh();
330 debug_object_free(head, &rcuhead_debug_descr);
331 return 1; 241 return 1;
332 default:
333 return 0;
334 } 242 }
335} 243}
336 244
@@ -369,15 +277,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
369 277
370struct debug_obj_descr rcuhead_debug_descr = { 278struct debug_obj_descr rcuhead_debug_descr = {
371 .name = "rcu_head", 279 .name = "rcu_head",
372 .fixup_init = rcuhead_fixup_init,
373 .fixup_activate = rcuhead_fixup_activate, 280 .fixup_activate = rcuhead_fixup_activate,
374 .fixup_free = rcuhead_fixup_free,
375}; 281};
376EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 282EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
377#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 283#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
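
The EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack) context above hints at the pattern these debug-objects hooks police. A hedged sketch with illustrative foo_* names (synchronize_rcu() would normally be used for a plain grace-period wait; the point here is only the init/destroy bracketing of an on-stack rcu_head):

#include <linux/completion.h>
#include <linux/rcupdate.h>

struct foo_waiter {
	struct rcu_head rh;
	struct completion done;
};

static void foo_rcu_cb(struct rcu_head *rh)
{
	struct foo_waiter *w = container_of(rh, struct foo_waiter, rh);

	complete(&w->done);
}

static void foo_wait_one_grace_period(void)
{
	struct foo_waiter w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.rh);		/* tell debugobjects about the stack object */
	call_rcu(&w.rh, foo_rcu_cb);		/* callback runs after a grace period */
	wait_for_completion(&w.done);
	destroy_rcu_head_on_stack(&w.rh);	/* required before the stack frame is reused */
}
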
378 284
379#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 285#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
380void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, 286void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
381 unsigned long secs, 287 unsigned long secs,
382 unsigned long c_old, unsigned long c) 288 unsigned long c_old, unsigned long c)
383{ 289{
@@ -398,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
398#endif 304#endif
399 305
400int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
401int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
402 308
403module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
404module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
32#endif 32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34 34
35int reboot_default; 35/*
36 * This variable is used privately to keep track of whether or not
37 * reboot_type is still set to its default value (i.e., reboot= hasn't
38 * been set on the command line). This is needed so that we can
39 * suppress DMI scanning for reboot quirks. Without it, it's
40 * impossible to override a faulty reboot quirk without recompiling.
41 */
42int reboot_default = 1;
36int reboot_cpu; 43int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI; 44enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 45int reboot_force;
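
A hedged sketch of the consumer side this comment implies (example_reboot_quirks and example_reboot_dmi_table are illustrative names, not part of this hunk; the real users live under arch/): the quirk scan only runs while reboot_default is still 1, i.e. before reboot= has overridden it.

#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/reboot.h>

static int __init example_reboot_quirks(void)
{
	/*
	 * reboot_default stays 1 until reboot= is parsed, so DMI quirks
	 * are applied only when the user has not picked a method.
	 */
	if (reboot_default)
		dmi_check_system(example_reboot_dmi_table);	/* illustrative table */
	return 0;
}
core_initcall(example_reboot_quirks);
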
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e7049..4aa8a305aede 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,8 +17,8 @@
17void res_counter_init(struct res_counter *counter, struct res_counter *parent) 17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{ 18{
19 spin_lock_init(&counter->lock); 19 spin_lock_init(&counter->lock);
20 counter->limit = RESOURCE_MAX; 20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RESOURCE_MAX; 21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
178#endif 178#endif
179 179
180int res_counter_memparse_write_strategy(const char *buf, 180int res_counter_memparse_write_strategy(const char *buf,
181 unsigned long long *res) 181 unsigned long long *resp)
182{ 182{
183 char *end; 183 char *end;
184 unsigned long long res;
184 185
185 /* return RESOURCE_MAX(unlimited) if "-1" is specified */ 186 /* return RES_COUNTER_MAX (unlimited) if "-1" is specified */
186 if (*buf == '-') { 187 if (*buf == '-') {
187 *res = simple_strtoull(buf + 1, &end, 10); 188 res = simple_strtoull(buf + 1, &end, 10);
188 if (*res != 1 || *end != '\0') 189 if (res != 1 || *end != '\0')
189 return -EINVAL; 190 return -EINVAL;
190 *res = RESOURCE_MAX; 191 *resp = RES_COUNTER_MAX;
191 return 0; 192 return 0;
192 } 193 }
193 194
194 *res = memparse(buf, &end); 195 res = memparse(buf, &end);
195 if (*end != '\0') 196 if (*end != '\0')
196 return -EINVAL; 197 return -EINVAL;
197 198
198 *res = PAGE_ALIGN(*res); 199 if (PAGE_ALIGN(res) >= res)
200 res = PAGE_ALIGN(res);
201 else
202 res = RES_COUNTER_MAX;
203
204 *resp = res;
205
199 return 0; 206 return 0;
200} 207}
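
A short usage sketch tracing the rewritten parser (the 'limit' local is illustrative; return-value checks are omitted because these literals parse cleanly):

	unsigned long long limit;

	res_counter_memparse_write_strategy("-1", &limit);
	/* limit == RES_COUNTER_MAX: "-1" still means unlimited */

	res_counter_memparse_write_strategy("4M", &limit);
	/* limit == 4 << 20: memparse() result, already a multiple of PAGE_SIZE */

	res_counter_memparse_write_strategy("18446744073709551615", &limit);
	/* PAGE_ALIGN() would wrap here, so the value is clamped to RES_COUNTER_MAX */
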
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler.
3 *
4 * It differs from semaphores in that the default case is the opposite:
5 * wait_for_completion() blocks by default, whereas a semaphore does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores would
10 * typically be used for exclusion, which gives rise to priority inversion.
11 * Waiting for completion is typically a synchronization point, not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
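
The kernel-doc above spells out the semantics; a minimal sketch of the one-shot handshake (setup_done, do_setup() and worker_fn() are illustrative names):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
	do_setup();			/* illustrative work */
	complete(&setup_done);		/* wakes one waiter, in queueing order */
	return 0;
}

static void wait_for_setup(void)
{
	/* Uninterruptible, no timeout: returns only once complete() has run. */
	wait_for_completion(&setup_done);
}
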
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
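
A sketch of the usual timeout idiom, reusing the illustrative setup_done above:

	unsigned long left;

	left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(100));
	if (!left)
		pr_warn("setup did not finish within 100ms\n");
	/* otherwise 'left' is the number of jiffies that were still remaining */
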
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
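
A sketch of propagating a signal back to the caller (again with the illustrative setup_done):

	int err;

	err = wait_for_completion_interruptible(&setup_done);
	if (err)		/* -ERESTARTSYS: a signal arrived before completion */
		return err;
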
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
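
A sketch of the counting use the kernel-doc describes (resource_ready and consume_one() are illustrative):

	if (try_wait_for_completion(&resource_ready))
		consume_one();		/* took one 'done' count without sleeping */
	else
		return -EAGAIN;		/* nothing available right now, don't block */
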
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb7bfeb..1deccd78be98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -933,6 +929,8 @@ static int effective_prio(struct task_struct *p)
933/** 929/**
934 * task_curr - is this task currently executing on a CPU? 930 * task_curr - is this task currently executing on a CPU?
935 * @p: the task in question. 931 * @p: the task in question.
932 *
933 * Return: 1 if the task is currently executing. 0 otherwise.
936 */ 934 */
937inline int task_curr(const struct task_struct *p) 935inline int task_curr(const struct task_struct *p)
938{ 936{
@@ -976,13 +974,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
976 rq->skip_clock_update = 1; 974 rq->skip_clock_update = 1;
977} 975}
978 976
979static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
980
981void register_task_migration_notifier(struct notifier_block *n)
982{
983 atomic_notifier_chain_register(&task_migration_notifier, n);
984}
985
986#ifdef CONFIG_SMP 977#ifdef CONFIG_SMP
987void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 978void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
988{ 979{
@@ -992,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
992 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
993 */ 984 */
994 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
995 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
996 987
997#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
998 /* 989 /*
@@ -1013,21 +1004,114 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1013 trace_sched_migrate_task(p, new_cpu); 1004 trace_sched_migrate_task(p, new_cpu);
1014 1005
1015 if (task_cpu(p) != new_cpu) { 1006 if (task_cpu(p) != new_cpu) {
1016 struct task_migration_notifier tmn;
1017
1018 if (p->sched_class->migrate_task_rq) 1007 if (p->sched_class->migrate_task_rq)
1019 p->sched_class->migrate_task_rq(p, new_cpu); 1008 p->sched_class->migrate_task_rq(p, new_cpu);
1020 p->se.nr_migrations++; 1009 p->se.nr_migrations++;
1021 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1010 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1011 }
1012
1013 __set_task_cpu(p, new_cpu);
1014}
1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1022 1020
1023 tmn.task = p; 1021 src_rq = task_rq(p);
1024 tmn.from_cpu = task_cpu(p); 1022 dst_rq = cpu_rq(cpu);
1025 tmn.to_cpu = new_cpu;
1026 1023
1027 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); 1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1028 } 1035 }
1036}
1029 1037
1030 __set_task_cpu(p, new_cpu); 1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1031} 1115}
1032 1116
1033struct migration_arg { 1117struct migration_arg {
@@ -1249,9 +1333,9 @@ out:
1249 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1250 */ 1334 */
1251static inline 1335static inline
1252int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1253{ 1337{
1254 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1255 1339
1256 /* 1340 /*
1257 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1343,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1343 1427
1344 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1345 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1346 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1347 1431
1348 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1349 rq->avg_idle = max; 1435 rq->avg_idle = max;
1350 else 1436
1351 update_avg(&rq->avg_idle, delta);
1352 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1353 } 1438 }
1354#endif 1439#endif
@@ -1409,6 +1494,14 @@ static void sched_ttwu_pending(void)
1409 1494
1410void scheduler_ipi(void) 1495void scheduler_ipi(void)
1411{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1412 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1413 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1414 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1482,7 +1575,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1482 * the simpler "current->state = TASK_RUNNING" to mark yourself 1575 * the simpler "current->state = TASK_RUNNING" to mark yourself
1483 * runnable without the overhead of this. 1576 * runnable without the overhead of this.
1484 * 1577 *
1485 * Returns %true if @p was woken up, %false if it was already running 1578 * Return: %true if @p was woken up, %false if it was already running
1486 * or @state didn't match @p's state. 1579 * or @state didn't match @p's state.
1487 */ 1580 */
1488static int 1581static int
@@ -1491,7 +1584,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1491 unsigned long flags; 1584 unsigned long flags;
1492 int cpu, success = 0; 1585 int cpu, success = 0;
1493 1586
1494 smp_wmb(); 1587 /*
1588 * If we are going to wake up a thread waiting for CONDITION we
1589 * need to ensure that CONDITION=1 done by the caller can not be
1590 * reordered with p->state check below. This pairs with mb() in
1591 * set_current_state() the waiting thread does.
1592 */
1593 smp_mb__before_spinlock();
1495 raw_spin_lock_irqsave(&p->pi_lock, flags); 1594 raw_spin_lock_irqsave(&p->pi_lock, flags);
1496 if (!(p->state & state)) 1595 if (!(p->state & state))
1497 goto out; 1596 goto out;
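
The canonical pairing the new comment refers to, in sketch form ('condition' and 'waiter' are illustrative):

	/* sleeper */
	set_current_state(TASK_UNINTERRUPTIBLE);	/* implies a full memory barrier */
	if (!condition)
		schedule();
	__set_current_state(TASK_RUNNING);

	/* waker */
	condition = true;		/* must be visible before p->state is read */
	wake_up_process(waiter);	/* ends up in try_to_wake_up() above */
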
@@ -1520,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1520 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1521 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1522 1621
1523 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1524 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1525 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1526 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1577,8 +1676,9 @@ out:
1577 * @p: The process to be woken up. 1676 * @p: The process to be woken up.
1578 * 1677 *
1579 * Attempt to wake up the nominated process and move it to the set of runnable 1678 * Attempt to wake up the nominated process and move it to the set of runnable
1580 * processes. Returns 1 if the process was woken up, 0 if it was already 1679 * processes.
1581 * running. 1680 *
1681 * Return: 1 if the process was woken up, 0 if it was already running.
1582 * 1682 *
1583 * It may be assumed that this function implies a write memory barrier before 1683 * It may be assumed that this function implies a write memory barrier before
1584 * changing the task state if and only if any tasks are woken up. 1684 * changing the task state if and only if any tasks are woken up.
@@ -1601,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1601 * 1701 *
1602 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1603 */ 1703 */
1604static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1605{ 1705{
1606 p->on_rq = 0; 1706 p->on_rq = 0;
1607 1707
@@ -1625,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1625 1725
1626#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1627 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1628 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1629 p->mm->numa_next_reset = jiffies;
1630 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1631 } 1730 }
1632 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1633 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1634 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1635 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1636 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1637 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1638#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1639} 1747}
1640 1748
@@ -1660,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1660/* 1768/*
1661 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1662 */ 1770 */
1663void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1664{ 1772{
1665 unsigned long flags; 1773 unsigned long flags;
1666 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1667 1775
1668 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1669 /* 1777 /*
1670 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1671 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1723,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1723#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1724 p->on_cpu = 0; 1832 p->on_cpu = 0;
1725#endif 1833#endif
1726#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1727 /* Want to start with kernel preemption disabled. */
1728 task_thread_info(p)->preempt_count = 1;
1729#endif
1730#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1731 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1732#endif 1837#endif
@@ -1753,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1753 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1754 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1755 */ 1860 */
1756 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1757#endif 1862#endif
1758 1863
1759 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1844,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1844 struct task_struct *next) 1949 struct task_struct *next)
1845{ 1950{
1846 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1847 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1848 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1849 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1850 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1896,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1896 if (mm) 2001 if (mm)
1897 mmdrop(mm); 2002 mmdrop(mm);
1898 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1899 /* 2006 /*
1900 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1901 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2079,7 +2186,7 @@ void sched_exec(void)
2079 int dest_cpu; 2186 int dest_cpu;
2080 2187
2081 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2082 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2083 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2084 goto unlock; 2191 goto unlock;
2085 2192
@@ -2191,6 +2298,8 @@ void scheduler_tick(void)
2191 * This makes sure that uptime, CFS vruntime, load 2298 * This makes sure that uptime, CFS vruntime, load
2192 * balancing, etc... continue to move forward, even 2299 * balancing, etc... continue to move forward, even
2193 * with a very low granularity. 2300 * with a very low granularity.
2301 *
2302 * Return: Maximum deferment in nanoseconds.
2194 */ 2303 */
2195u64 scheduler_tick_max_deferment(void) 2304u64 scheduler_tick_max_deferment(void)
2196{ 2305{
@@ -2219,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2219#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2220 defined(CONFIG_PREEMPT_TRACER)) 2329 defined(CONFIG_PREEMPT_TRACER))
2221 2330
2222void __kprobes add_preempt_count(int val) 2331void __kprobes preempt_count_add(int val)
2223{ 2332{
2224#ifdef CONFIG_DEBUG_PREEMPT 2333#ifdef CONFIG_DEBUG_PREEMPT
2225 /* 2334 /*
@@ -2228,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
2228 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2229 return; 2338 return;
2230#endif 2339#endif
2231 preempt_count() += val; 2340 __preempt_count_add(val);
2232#ifdef CONFIG_DEBUG_PREEMPT 2341#ifdef CONFIG_DEBUG_PREEMPT
2233 /* 2342 /*
2234 * Spinlock count overflowing soon? 2343 * Spinlock count overflowing soon?
@@ -2239,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
2239 if (preempt_count() == val) 2348 if (preempt_count() == val)
2240 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2241} 2350}
2242EXPORT_SYMBOL(add_preempt_count); 2351EXPORT_SYMBOL(preempt_count_add);
2243 2352
2244void __kprobes sub_preempt_count(int val) 2353void __kprobes preempt_count_sub(int val)
2245{ 2354{
2246#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2247 /* 2356 /*
@@ -2259,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
2259 2368
2260 if (preempt_count() == val) 2369 if (preempt_count() == val)
2261 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2262 preempt_count() -= val; 2371 __preempt_count_sub(val);
2263} 2372}
2264EXPORT_SYMBOL(sub_preempt_count); 2373EXPORT_SYMBOL(preempt_count_sub);
2265 2374
2266#endif 2375#endif
2267 2376
@@ -2394,6 +2503,12 @@ need_resched:
2394 if (sched_feat(HRTICK)) 2503 if (sched_feat(HRTICK))
2395 hrtick_clear(rq); 2504 hrtick_clear(rq);
2396 2505
2506 /*
2507 * Make sure that signal_pending_state()->signal_pending() below
2508 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2509 * done by the caller to avoid the race with signal_wake_up().
2510 */
2511 smp_mb__before_spinlock();
2397 raw_spin_lock_irq(&rq->lock); 2512 raw_spin_lock_irq(&rq->lock);
2398 2513
2399 switch_count = &prev->nivcsw; 2514 switch_count = &prev->nivcsw;
@@ -2428,6 +2543,7 @@ need_resched:
2428 put_prev_task(rq, prev); 2543 put_prev_task(rq, prev);
2429 next = pick_next_task(rq); 2544 next = pick_next_task(rq);
2430 clear_tsk_need_resched(prev); 2545 clear_tsk_need_resched(prev);
2546 clear_preempt_need_resched();
2431 rq->skip_clock_update = 0; 2547 rq->skip_clock_update = 0;
2432 2548
2433 if (likely(prev != next)) { 2549 if (likely(prev != next)) {
@@ -2510,19 +2626,17 @@ void __sched schedule_preempt_disabled(void)
2510 */ 2626 */
2511asmlinkage void __sched notrace preempt_schedule(void) 2627asmlinkage void __sched notrace preempt_schedule(void)
2512{ 2628{
2513 struct thread_info *ti = current_thread_info();
2514
2515 /* 2629 /*
2516 * If there is a non-zero preempt_count or interrupts are disabled, 2630 * If there is a non-zero preempt_count or interrupts are disabled,
2517 * we do not want to preempt the current task. Just return.. 2631 * we do not want to preempt the current task. Just return..
2518 */ 2632 */
2519 if (likely(ti->preempt_count || irqs_disabled())) 2633 if (likely(!preemptible()))
2520 return; 2634 return;
2521 2635
2522 do { 2636 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2637 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2638 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2639 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2640
2527 /* 2641 /*
2528 * Check again in case we missed a preemption opportunity 2642 * Check again in case we missed a preemption opportunity
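
For reference (assumed from <linux/preempt.h> under CONFIG_PREEMPT_COUNT, not part of this hunk), the test that replaces the open-coded check is:

#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
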
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2655 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2656asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2657{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2658 enum ctx_state prev_state;
2546 2659
2547 /* Catch callers which need to be fixed */ 2660 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2661 BUG_ON(preempt_count() || !irqs_disabled());
2549 2662
2550 prev_state = exception_enter(); 2663 prev_state = exception_enter();
2551 2664
2552 do { 2665 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2666 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2667 local_irq_enable();
2555 __schedule(); 2668 __schedule();
2556 local_irq_disable(); 2669 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2670 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2671
2559 /* 2672 /*
2560 * Check again in case we missed a preemption opportunity 2673 * Check again in case we missed a preemption opportunity
@@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2688}
2576EXPORT_SYMBOL(default_wake_function); 2689EXPORT_SYMBOL(default_wake_function);
2577 2690
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(!nr_exclusive))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * The return value is 0 if timed out, and positive (at least 1, or number of
2800 * jiffies left till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * The return value is 0 if timed out, and positive (at least 1, or number of
2833 * jiffies left till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
2869 * positive (at least 1, or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
2907 * positive (at least 1, or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Returns: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Returns: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2691static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2692sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2693{
@@ -3182,7 +2908,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3182 * task_prio - return the priority value of a given task. 2908 * task_prio - return the priority value of a given task.
3183 * @p: the task in question. 2909 * @p: the task in question.
3184 * 2910 *
3185 * This is the priority value as seen by users in /proc. 2911 * Return: The priority value as seen by users in /proc.
3186 * RT tasks are offset by -200. Normal tasks are centered 2912 * RT tasks are offset by -200. Normal tasks are centered
3187 * around 0, value goes from -16 to +15. 2913 * around 0, value goes from -16 to +15.
3188 */ 2914 */
@@ -3194,6 +2920,8 @@ int task_prio(const struct task_struct *p)
3194/** 2920/**
3195 * task_nice - return the nice value of a given task. 2921 * task_nice - return the nice value of a given task.
3196 * @p: the task in question. 2922 * @p: the task in question.
2923 *
2924 * Return: The nice value [ -20 ... 0 ... 19 ].
3197 */ 2925 */
3198int task_nice(const struct task_struct *p) 2926int task_nice(const struct task_struct *p)
3199{ 2927{
@@ -3204,6 +2932,8 @@ EXPORT_SYMBOL(task_nice);
3204/** 2932/**
3205 * idle_cpu - is a given cpu idle currently? 2933 * idle_cpu - is a given cpu idle currently?
3206 * @cpu: the processor in question. 2934 * @cpu: the processor in question.
2935 *
2936 * Return: 1 if the CPU is currently idle. 0 otherwise.
3207 */ 2937 */
3208int idle_cpu(int cpu) 2938int idle_cpu(int cpu)
3209{ 2939{
@@ -3226,6 +2956,8 @@ int idle_cpu(int cpu)
3226/** 2956/**
3227 * idle_task - return the idle task for a given cpu. 2957 * idle_task - return the idle task for a given cpu.
3228 * @cpu: the processor in question. 2958 * @cpu: the processor in question.
2959 *
2960 * Return: The idle task for the cpu @cpu.
3229 */ 2961 */
3230struct task_struct *idle_task(int cpu) 2962struct task_struct *idle_task(int cpu)
3231{ 2963{
@@ -3235,6 +2967,8 @@ struct task_struct *idle_task(int cpu)
3235/** 2967/**
3236 * find_process_by_pid - find a process with a matching PID value. 2968 * find_process_by_pid - find a process with a matching PID value.
3237 * @pid: the pid in question. 2969 * @pid: the pid in question.
2970 *
2971 * The task of @pid, if found. %NULL otherwise.
3238 */ 2972 */
3239static struct task_struct *find_process_by_pid(pid_t pid) 2973static struct task_struct *find_process_by_pid(pid_t pid)
3240{ 2974{
@@ -3432,6 +3166,8 @@ recheck:
3432 * @policy: new policy. 3166 * @policy: new policy.
3433 * @param: structure containing the new RT priority. 3167 * @param: structure containing the new RT priority.
3434 * 3168 *
3169 * Return: 0 on success. An error code otherwise.
3170 *
3435 * NOTE that the task may be already dead. 3171 * NOTE that the task may be already dead.
3436 */ 3172 */
3437int sched_setscheduler(struct task_struct *p, int policy, 3173int sched_setscheduler(struct task_struct *p, int policy,
@@ -3451,6 +3187,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3451 * current context has permission. For example, this is needed in 3187 * current context has permission. For example, this is needed in
3452 * stop_machine(): we create temporary high priority worker threads, 3188 * stop_machine(): we create temporary high priority worker threads,
3453 * but our caller might not have that capability. 3189 * but our caller might not have that capability.
3190 *
3191 * Return: 0 on success. An error code otherwise.
3454 */ 3192 */
3455int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3193int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3456 const struct sched_param *param) 3194 const struct sched_param *param)
@@ -3485,6 +3223,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3485 * @pid: the pid in question. 3223 * @pid: the pid in question.
3486 * @policy: new policy. 3224 * @policy: new policy.
3487 * @param: structure containing the new RT priority. 3225 * @param: structure containing the new RT priority.
3226 *
3227 * Return: 0 on success. An error code otherwise.
3488 */ 3228 */
3489SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3229SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3490 struct sched_param __user *, param) 3230 struct sched_param __user *, param)
@@ -3500,6 +3240,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3500 * sys_sched_setparam - set/change the RT priority of a thread 3240 * sys_sched_setparam - set/change the RT priority of a thread
3501 * @pid: the pid in question. 3241 * @pid: the pid in question.
3502 * @param: structure containing the new RT priority. 3242 * @param: structure containing the new RT priority.
3243 *
3244 * Return: 0 on success. An error code otherwise.
3503 */ 3245 */
3504SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3246SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3505{ 3247{
@@ -3509,6 +3251,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3509/** 3251/**
3510 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3252 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3511 * @pid: the pid in question. 3253 * @pid: the pid in question.
3254 *
3255 * Return: On success, the policy of the thread. Otherwise, a negative error
3256 * code.
3512 */ 3257 */
3513SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3258SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3514{ 3259{
@@ -3535,6 +3280,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3535 * sys_sched_getparam - get the RT priority of a thread 3280 * sys_sched_getparam - get the RT priority of a thread
3536 * @pid: the pid in question. 3281 * @pid: the pid in question.
3537 * @param: structure containing the RT priority. 3282 * @param: structure containing the RT priority.
3283 *
3284 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3285 * code.
3538 */ 3286 */
3539SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3287SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3540{ 3288{
@@ -3576,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3576 struct task_struct *p; 3324 struct task_struct *p;
3577 int retval; 3325 int retval;
3578 3326
3579 get_online_cpus();
3580 rcu_read_lock(); 3327 rcu_read_lock();
3581 3328
3582 p = find_process_by_pid(pid); 3329 p = find_process_by_pid(pid);
3583 if (!p) { 3330 if (!p) {
3584 rcu_read_unlock(); 3331 rcu_read_unlock();
3585 put_online_cpus();
3586 return -ESRCH; 3332 return -ESRCH;
3587 } 3333 }
3588 3334
@@ -3639,7 +3385,6 @@ out_free_cpus_allowed:
3639 free_cpumask_var(cpus_allowed); 3385 free_cpumask_var(cpus_allowed);
3640out_put_task: 3386out_put_task:
3641 put_task_struct(p); 3387 put_task_struct(p);
3642 put_online_cpus();
3643 return retval; 3388 return retval;
3644} 3389}
3645 3390
@@ -3659,6 +3404,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3659 * @pid: pid of the process 3404 * @pid: pid of the process
3660 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3405 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3661 * @user_mask_ptr: user-space pointer to the new cpu mask 3406 * @user_mask_ptr: user-space pointer to the new cpu mask
3407 *
3408 * Return: 0 on success. An error code otherwise.
3662 */ 3409 */
3663SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3410SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3664 unsigned long __user *, user_mask_ptr) 3411 unsigned long __user *, user_mask_ptr)
@@ -3682,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3682 unsigned long flags; 3429 unsigned long flags;
3683 int retval; 3430 int retval;
3684 3431
3685 get_online_cpus();
3686 rcu_read_lock(); 3432 rcu_read_lock();
3687 3433
3688 retval = -ESRCH; 3434 retval = -ESRCH;
@@ -3695,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3695 goto out_unlock; 3441 goto out_unlock;
3696 3442
3697 raw_spin_lock_irqsave(&p->pi_lock, flags); 3443 raw_spin_lock_irqsave(&p->pi_lock, flags);
3698 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3444 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3699 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3445 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3700 3446
3701out_unlock: 3447out_unlock:
3702 rcu_read_unlock(); 3448 rcu_read_unlock();
3703 put_online_cpus();
3704 3449
3705 return retval; 3450 return retval;
3706} 3451}
@@ -3710,6 +3455,8 @@ out_unlock:
3710 * @pid: pid of the process 3455 * @pid: pid of the process
3711 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3456 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3712 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3457 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3458 *
3459 * Return: 0 on success. An error code otherwise.
3713 */ 3460 */
3714SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3461SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3715 unsigned long __user *, user_mask_ptr) 3462 unsigned long __user *, user_mask_ptr)
@@ -3744,6 +3491,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3744 * 3491 *
3745 * This function yields the current CPU to other tasks. If there are no 3492 * This function yields the current CPU to other tasks. If there are no
3746 * other threads running on this CPU then this function will return. 3493 * other threads running on this CPU then this function will return.
3494 *
3495 * Return: 0.
3747 */ 3496 */
3748SYSCALL_DEFINE0(sched_yield) 3497SYSCALL_DEFINE0(sched_yield)
3749{ 3498{
@@ -3766,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield)
3766 return 0; 3515 return 0;
3767} 3516}
3768 3517
3769static inline int should_resched(void)
3770{
3771 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3772}
3773
3774static void __cond_resched(void) 3518static void __cond_resched(void)
3775{ 3519{
3776 add_preempt_count(PREEMPT_ACTIVE); 3520 __preempt_count_add(PREEMPT_ACTIVE);
3777 __schedule(); 3521 __schedule();
3778 sub_preempt_count(PREEMPT_ACTIVE); 3522 __preempt_count_sub(PREEMPT_ACTIVE);
3779} 3523}
3780 3524
3781int __sched _cond_resched(void) 3525int __sched _cond_resched(void)
@@ -3869,7 +3613,7 @@ EXPORT_SYMBOL(yield);
3869 * It's the caller's job to ensure that the target task struct 3613 * It's the caller's job to ensure that the target task struct
3870 * can't go away on us before we can do any checks. 3614 * can't go away on us before we can do any checks.
3871 * 3615 *
3872 * Returns: 3616 * Return:
3873 * true (>0) if we indeed boosted the target task. 3617 * true (>0) if we indeed boosted the target task.
3874 * false (0) if we failed to boost the target. 3618 * false (0) if we failed to boost the target.
3875 * -ESRCH if there's no task to yield to. 3619 * -ESRCH if there's no task to yield to.
@@ -3972,8 +3716,9 @@ long __sched io_schedule_timeout(long timeout)
3972 * sys_sched_get_priority_max - return maximum RT priority. 3716 * sys_sched_get_priority_max - return maximum RT priority.
3973 * @policy: scheduling class. 3717 * @policy: scheduling class.
3974 * 3718 *
3975 * this syscall returns the maximum rt_priority that can be used 3719 * Return: On success, this syscall returns the maximum
3976 * by a given scheduling class. 3720 * rt_priority that can be used by a given scheduling class.
3721 * On failure, a negative error code is returned.
3977 */ 3722 */
3978SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 3723SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3979{ 3724{
@@ -3997,8 +3742,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3997 * sys_sched_get_priority_min - return minimum RT priority. 3742 * sys_sched_get_priority_min - return minimum RT priority.
3998 * @policy: scheduling class. 3743 * @policy: scheduling class.
3999 * 3744 *
4000 * this syscall returns the minimum rt_priority that can be used 3745 * Return: On success, this syscall returns the minimum
4001 * by a given scheduling class. 3746 * rt_priority that can be used by a given scheduling class.
3747 * On failure, a negative error code is returned.
4002 */ 3748 */
4003SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 3749SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4004{ 3750{
@@ -4024,6 +3770,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4024 * 3770 *
4025 * this syscall writes the default timeslice value of a given process 3771 * this syscall writes the default timeslice value of a given process
4026 * into the user-space timespec buffer. A value of '0' means infinity. 3772 * into the user-space timespec buffer. A value of '0' means infinity.
3773 *
3774 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
3775 * an error code.
4027 */ 3776 */
4028SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 3777SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4029 struct timespec __user *, interval) 3778 struct timespec __user *, interval)
@@ -4153,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu)
4153 3902
4154 raw_spin_lock_irqsave(&rq->lock, flags); 3903 raw_spin_lock_irqsave(&rq->lock, flags);
4155 3904
4156 __sched_fork(idle); 3905 __sched_fork(0, idle);
4157 idle->state = TASK_RUNNING; 3906 idle->state = TASK_RUNNING;
4158 idle->se.exec_start = sched_clock(); 3907 idle->se.exec_start = sched_clock();
4159 3908
@@ -4179,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu)
4179 raw_spin_unlock_irqrestore(&rq->lock, flags); 3928 raw_spin_unlock_irqrestore(&rq->lock, flags);
4180 3929
4181 /* Set the preempt count _outside_ the spinlocks! */ 3930 /* Set the preempt count _outside_ the spinlocks! */
4182 task_thread_info(idle)->preempt_count = 0; 3931 init_idle_preempt_count(idle, cpu);
4183 3932
4184 /* 3933 /*
4185 * The idle tasks have their own, simple scheduling class: 3934 * The idle tasks have their own, simple scheduling class:
@@ -4313,6 +4062,53 @@ fail:
4313 return ret; 4062 return ret;
4314} 4063}
4315 4064
4065#ifdef CONFIG_NUMA_BALANCING
4066/* Migrate current task p to target_cpu */
4067int migrate_task_to(struct task_struct *p, int target_cpu)
4068{
4069 struct migration_arg arg = { p, target_cpu };
4070 int curr_cpu = task_cpu(p);
4071
4072 if (curr_cpu == target_cpu)
4073 return 0;
4074
4075 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4076 return -EINVAL;
4077
4078 /* TODO: This is not properly updating schedstats */
4079
4080 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4081}
4082
4083/*
4084 * Requeue a task on a given node and accurately track the number of NUMA
4085 * tasks on the runqueues
4086 */
4087void sched_setnuma(struct task_struct *p, int nid)
4088{
4089 struct rq *rq;
4090 unsigned long flags;
4091 bool on_rq, running;
4092
4093 rq = task_rq_lock(p, &flags);
4094 on_rq = p->on_rq;
4095 running = task_current(rq, p);
4096
4097 if (on_rq)
4098 dequeue_task(rq, p, 0);
4099 if (running)
4100 p->sched_class->put_prev_task(rq, p);
4101
4102 p->numa_preferred_nid = nid;
4103
4104 if (running)
4105 p->sched_class->set_curr_task(rq);
4106 if (on_rq)
4107 enqueue_task(rq, p, 0);
4108 task_rq_unlock(rq, p, &flags);
4109}
4110#endif
4111
4316/* 4112/*
4317 * migration_cpu_stop - this will be executed by a highprio stopper thread 4113 * migration_cpu_stop - this will be executed by a highprio stopper thread
4318 * and performs thread migration by bumping thread off CPU then 4114 * and performs thread migration by bumping thread off CPU then
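
The sched_setnuma() helper added above uses the scheduler's usual pattern for changing an attribute that per-runqueue bookkeeping depends on: take the task off its runqueue (and drop it as the current task) first, update the field, then requeue it so the NUMA counters stay consistent. A minimal user-space sketch of that dequeue/modify/enqueue pattern, with a hypothetical array-backed queue standing in for the real runqueue helpers:

    /* Sketch only: mirrors the dequeue -> modify -> enqueue pattern of
     * sched_setnuma(); the queue and task types here are made up. */
    #include <stdbool.h>
    #include <stdio.h>

    struct task { int preferred_nid; bool queued; };
    struct runqueue { struct task *tasks[8]; int nr; };

    static void dequeue(struct runqueue *rq, struct task *t)
    {
        for (int i = 0; i < rq->nr; i++) {
            if (rq->tasks[i] == t) {
                rq->tasks[i] = rq->tasks[--rq->nr];
                t->queued = false;
                return;
            }
        }
    }

    static void enqueue(struct runqueue *rq, struct task *t)
    {
        rq->tasks[rq->nr++] = t;
        t->queued = true;
    }

    /* Change a field the queue accounting depends on: off the queue first,
     * update, then back on, so per-queue counters stay consistent. */
    static void set_preferred_node(struct runqueue *rq, struct task *t, int nid)
    {
        bool was_queued = t->queued;

        if (was_queued)
            dequeue(rq, t);
        t->preferred_nid = nid;
        if (was_queued)
            enqueue(rq, t);
    }

    int main(void)
    {
        struct runqueue rq = { .nr = 0 };
        struct task t = { .preferred_nid = -1 };

        enqueue(&rq, &t);
        set_preferred_node(&rq, &t, 1);
        printf("preferred_nid=%d queued=%d\n", t.preferred_nid, t.queued);
        return 0;
    }
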
@@ -4914,7 +4710,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4914 SD_BALANCE_FORK | 4710 SD_BALANCE_FORK |
4915 SD_BALANCE_EXEC | 4711 SD_BALANCE_EXEC |
4916 SD_SHARE_CPUPOWER | 4712 SD_SHARE_CPUPOWER |
4917 SD_SHARE_PKG_RESOURCES); 4713 SD_SHARE_PKG_RESOURCES |
4714 SD_PREFER_SIBLING);
4918 if (nr_node_ids == 1) 4715 if (nr_node_ids == 1)
4919 pflags &= ~SD_SERIALIZE; 4716 pflags &= ~SD_SERIALIZE;
4920 } 4717 }
@@ -5083,19 +4880,34 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5083 * two cpus are in the same cache domain, see cpus_share_cache(). 4880 * two cpus are in the same cache domain, see cpus_share_cache().
5084 */ 4881 */
5085DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4882DEFINE_PER_CPU(struct sched_domain *, sd_llc);
4883DEFINE_PER_CPU(int, sd_llc_size);
5086DEFINE_PER_CPU(int, sd_llc_id); 4884DEFINE_PER_CPU(int, sd_llc_id);
4885DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4886DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4887DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5087 4888
5088static void update_top_cache_domain(int cpu) 4889static void update_top_cache_domain(int cpu)
5089{ 4890{
5090 struct sched_domain *sd; 4891 struct sched_domain *sd;
5091 int id = cpu; 4892 int id = cpu;
4893 int size = 1;
5092 4894
5093 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 4895 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5094 if (sd) 4896 if (sd) {
5095 id = cpumask_first(sched_domain_span(sd)); 4897 id = cpumask_first(sched_domain_span(sd));
4898 size = cpumask_weight(sched_domain_span(sd));
4899 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
4900 }
5096 4901
5097 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4902 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
4903 per_cpu(sd_llc_size, cpu) = size;
5098 per_cpu(sd_llc_id, cpu) = id; 4904 per_cpu(sd_llc_id, cpu) = id;
4905
4906 sd = lowest_flag_domain(cpu, SD_NUMA);
4907 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4908
4909 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4910 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5099} 4911}
5100 4912
5101/* 4913/*
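
update_top_cache_domain() now caches, per CPU, not only the last-level-cache domain and the first CPU of its span (sd_llc_id) but also the span's size (sd_llc_size), plus the NUMA, busy and asym-packing domains. On a plain bitmask the two cached span values reduce to "first set bit" and "population count"; the sketch below uses GCC builtins as hypothetical stand-ins for cpumask_first() and cpumask_weight():

    #include <stdio.h>

    /* Sketch: a 64-bit mask standing in for a struct cpumask. */
    static int mask_first(unsigned long long mask)
    {
        return mask ? __builtin_ctzll(mask) : -1;   /* first set bit */
    }

    static int mask_weight(unsigned long long mask)
    {
        return __builtin_popcountll(mask);          /* number of set bits */
    }

    int main(void)
    {
        unsigned long long llc_span = 0xf0;  /* CPUs 4-7 share a cache */

        printf("sd_llc_id=%d sd_llc_size=%d\n",
               mask_first(llc_span), mask_weight(llc_span));
        return 0;
    }
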
@@ -5118,6 +4930,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5118 tmp->parent = parent->parent; 4930 tmp->parent = parent->parent;
5119 if (parent->parent) 4931 if (parent->parent)
5120 parent->parent->child = tmp; 4932 parent->parent->child = tmp;
4933 /*
4934 * Transfer SD_PREFER_SIBLING down in case of a
4935 * degenerate parent; the spans match for this
4936 * so the property transfers.
4937 */
4938 if (parent->flags & SD_PREFER_SIBLING)
4939 tmp->flags |= SD_PREFER_SIBLING;
5121 destroy_sched_domain(parent, cpu); 4940 destroy_sched_domain(parent, cpu);
5122 } else 4941 } else
5123 tmp = tmp->parent; 4942 tmp = tmp->parent;
@@ -5608,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5608 | 0*SD_SHARE_PKG_RESOURCES 5427 | 0*SD_SHARE_PKG_RESOURCES
5609 | 1*SD_SERIALIZE 5428 | 1*SD_SERIALIZE
5610 | 0*SD_PREFER_SIBLING 5429 | 0*SD_PREFER_SIBLING
5430 | 1*SD_NUMA
5611 | sd_local_flags(level) 5431 | sd_local_flags(level)
5612 , 5432 ,
5613 .last_balance = jiffies, 5433 .last_balance = jiffies,
@@ -6184,8 +6004,9 @@ match1:
6184 ; 6004 ;
6185 } 6005 }
6186 6006
6007 n = ndoms_cur;
6187 if (doms_new == NULL) { 6008 if (doms_new == NULL) {
6188 ndoms_cur = 0; 6009 n = 0;
6189 doms_new = &fallback_doms; 6010 doms_new = &fallback_doms;
6190 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6011 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6191 WARN_ON_ONCE(dattr_new); 6012 WARN_ON_ONCE(dattr_new);
@@ -6193,7 +6014,7 @@ match1:
6193 6014
6194 /* Build new domains */ 6015 /* Build new domains */
6195 for (i = 0; i < ndoms_new; i++) { 6016 for (i = 0; i < ndoms_new; i++) {
6196 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6017 for (j = 0; j < n && !new_topology; j++) {
6197 if (cpumask_equal(doms_new[i], doms_cur[j]) 6018 if (cpumask_equal(doms_new[i], doms_cur[j])
6198 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6019 && dattrs_equal(dattr_new, i, dattr_cur, j))
6199 goto match2; 6020 goto match2;
@@ -6288,14 +6109,17 @@ void __init sched_init_smp(void)
6288 6109
6289 sched_init_numa(); 6110 sched_init_numa();
6290 6111
6291 get_online_cpus(); 6112 /*
6113 * There's no userspace yet to cause hotplug operations; hence all the
6114 * cpu masks are stable and all blatant races in the below code cannot
6115 * happen.
6116 */
6292 mutex_lock(&sched_domains_mutex); 6117 mutex_lock(&sched_domains_mutex);
6293 init_sched_domains(cpu_active_mask); 6118 init_sched_domains(cpu_active_mask);
6294 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6119 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6295 if (cpumask_empty(non_isolated_cpus)) 6120 if (cpumask_empty(non_isolated_cpus))
6296 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6121 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6297 mutex_unlock(&sched_domains_mutex); 6122 mutex_unlock(&sched_domains_mutex);
6298 put_online_cpus();
6299 6123
6300 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6124 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6301 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6125 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6458,6 +6282,7 @@ void __init sched_init(void)
6458 rq->online = 0; 6282 rq->online = 0;
6459 rq->idle_stamp = 0; 6283 rq->idle_stamp = 0;
6460 rq->avg_idle = 2*sysctl_sched_migration_cost; 6284 rq->avg_idle = 2*sysctl_sched_migration_cost;
6285 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6461 6286
6462 INIT_LIST_HEAD(&rq->cfs_tasks); 6287 INIT_LIST_HEAD(&rq->cfs_tasks);
6463 6288
@@ -6632,6 +6457,8 @@ void normalize_rt_tasks(void)
6632 * @cpu: the processor in question. 6457 * @cpu: the processor in question.
6633 * 6458 *
6634 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6459 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6460 *
6461 * Return: The current task for @cpu.
6635 */ 6462 */
6636struct task_struct *curr_task(int cpu) 6463struct task_struct *curr_task(int cpu)
6637{ 6464{
@@ -6763,7 +6590,7 @@ void sched_move_task(struct task_struct *tsk)
6763 if (unlikely(running)) 6590 if (unlikely(running))
6764 tsk->sched_class->put_prev_task(rq, tsk); 6591 tsk->sched_class->put_prev_task(rq, tsk);
6765 6592
6766 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6593 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6767 lockdep_is_held(&tsk->sighand->siglock)), 6594 lockdep_is_held(&tsk->sighand->siglock)),
6768 struct task_group, css); 6595 struct task_group, css);
6769 tg = autogroup_task_group(tsk, tg); 6596 tg = autogroup_task_group(tsk, tg);
@@ -7085,23 +6912,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7085 6912
7086#ifdef CONFIG_CGROUP_SCHED 6913#ifdef CONFIG_CGROUP_SCHED
7087 6914
7088/* return corresponding task_group object of a cgroup */ 6915static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7089static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7090{ 6916{
7091 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 6917 return css ? container_of(css, struct task_group, css) : NULL;
7092 struct task_group, css);
7093} 6918}
7094 6919
7095static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 6920static struct cgroup_subsys_state *
6921cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7096{ 6922{
7097 struct task_group *tg, *parent; 6923 struct task_group *parent = css_tg(parent_css);
6924 struct task_group *tg;
7098 6925
7099 if (!cgrp->parent) { 6926 if (!parent) {
7100 /* This is early initialization for the top cgroup */ 6927 /* This is early initialization for the top cgroup */
7101 return &root_task_group.css; 6928 return &root_task_group.css;
7102 } 6929 }
7103 6930
7104 parent = cgroup_tg(cgrp->parent);
7105 tg = sched_create_group(parent); 6931 tg = sched_create_group(parent);
7106 if (IS_ERR(tg)) 6932 if (IS_ERR(tg))
7107 return ERR_PTR(-ENOMEM); 6933 return ERR_PTR(-ENOMEM);
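
The cgroup conversion above replaces cgroup_tg(), which resolved the subsystem state through the cgroup, with css_tg(): a plain container_of() downcast from the embedded cgroup_subsys_state, with a NULL check so the root's missing parent css maps to a NULL task_group. A self-contained sketch of that accessor pattern, with deliberately simplified struct names:

    #include <stddef.h>
    #include <stdio.h>

    /* The subsystem state is embedded in the controller-private structure,
     * so a pointer to the former can be turned back into the latter. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct css { int id; };                 /* stands in for cgroup_subsys_state */
    struct task_group { long shares; struct css css; };

    static struct task_group *css_tg(struct css *css)
    {
        return css ? container_of(css, struct task_group, css) : NULL;
    }

    int main(void)
    {
        struct task_group tg = { .shares = 1024, .css = { .id = 7 } };

        printf("shares via css: %ld\n", css_tg(&tg.css)->shares);
        printf("no parent css maps to: %p\n", (void *)css_tg(NULL));
        return 0;
    }
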
@@ -7109,41 +6935,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7109 return &tg->css; 6935 return &tg->css;
7110} 6936}
7111 6937
7112static int cpu_cgroup_css_online(struct cgroup *cgrp) 6938static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7113{ 6939{
7114 struct task_group *tg = cgroup_tg(cgrp); 6940 struct task_group *tg = css_tg(css);
7115 struct task_group *parent; 6941 struct task_group *parent = css_tg(css_parent(css));
7116
7117 if (!cgrp->parent)
7118 return 0;
7119 6942
7120 parent = cgroup_tg(cgrp->parent); 6943 if (parent)
7121 sched_online_group(tg, parent); 6944 sched_online_group(tg, parent);
7122 return 0; 6945 return 0;
7123} 6946}
7124 6947
7125static void cpu_cgroup_css_free(struct cgroup *cgrp) 6948static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7126{ 6949{
7127 struct task_group *tg = cgroup_tg(cgrp); 6950 struct task_group *tg = css_tg(css);
7128 6951
7129 sched_destroy_group(tg); 6952 sched_destroy_group(tg);
7130} 6953}
7131 6954
7132static void cpu_cgroup_css_offline(struct cgroup *cgrp) 6955static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7133{ 6956{
7134 struct task_group *tg = cgroup_tg(cgrp); 6957 struct task_group *tg = css_tg(css);
7135 6958
7136 sched_offline_group(tg); 6959 sched_offline_group(tg);
7137} 6960}
7138 6961
7139static int cpu_cgroup_can_attach(struct cgroup *cgrp, 6962static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7140 struct cgroup_taskset *tset) 6963 struct cgroup_taskset *tset)
7141{ 6964{
7142 struct task_struct *task; 6965 struct task_struct *task;
7143 6966
7144 cgroup_taskset_for_each(task, cgrp, tset) { 6967 cgroup_taskset_for_each(task, css, tset) {
7145#ifdef CONFIG_RT_GROUP_SCHED 6968#ifdef CONFIG_RT_GROUP_SCHED
7146 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 6969 if (!sched_rt_can_attach(css_tg(css), task))
7147 return -EINVAL; 6970 return -EINVAL;
7148#else 6971#else
7149 /* We don't support RT-tasks being in separate groups */ 6972 /* We don't support RT-tasks being in separate groups */
@@ -7154,18 +6977,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7154 return 0; 6977 return 0;
7155} 6978}
7156 6979
7157static void cpu_cgroup_attach(struct cgroup *cgrp, 6980static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7158 struct cgroup_taskset *tset) 6981 struct cgroup_taskset *tset)
7159{ 6982{
7160 struct task_struct *task; 6983 struct task_struct *task;
7161 6984
7162 cgroup_taskset_for_each(task, cgrp, tset) 6985 cgroup_taskset_for_each(task, css, tset)
7163 sched_move_task(task); 6986 sched_move_task(task);
7164} 6987}
7165 6988
7166static void 6989static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7167cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 6990 struct cgroup_subsys_state *old_css,
7168 struct task_struct *task) 6991 struct task_struct *task)
7169{ 6992{
7170 /* 6993 /*
7171 * cgroup_exit() is called in the copy_process() failure path. 6994 * cgroup_exit() is called in the copy_process() failure path.
@@ -7179,15 +7002,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7179} 7002}
7180 7003
7181#ifdef CONFIG_FAIR_GROUP_SCHED 7004#ifdef CONFIG_FAIR_GROUP_SCHED
7182static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7005static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7183 u64 shareval) 7006 struct cftype *cftype, u64 shareval)
7184{ 7007{
7185 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7008 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7186} 7009}
7187 7010
7188static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7011static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7012 struct cftype *cft)
7189{ 7013{
7190 struct task_group *tg = cgroup_tg(cgrp); 7014 struct task_group *tg = css_tg(css);
7191 7015
7192 return (u64) scale_load_down(tg->shares); 7016 return (u64) scale_load_down(tg->shares);
7193} 7017}
@@ -7231,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7231 7055
7232 runtime_enabled = quota != RUNTIME_INF; 7056 runtime_enabled = quota != RUNTIME_INF;
7233 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7057 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7234 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7058 /*
7059 * If we need to toggle cfs_bandwidth_used, off->on must occur
7060 * before making related changes, and on->off must occur afterwards
7061 */
7062 if (runtime_enabled && !runtime_was_enabled)
7063 cfs_bandwidth_usage_inc();
7235 raw_spin_lock_irq(&cfs_b->lock); 7064 raw_spin_lock_irq(&cfs_b->lock);
7236 cfs_b->period = ns_to_ktime(period); 7065 cfs_b->period = ns_to_ktime(period);
7237 cfs_b->quota = quota; 7066 cfs_b->quota = quota;
@@ -7257,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7257 unthrottle_cfs_rq(cfs_rq); 7086 unthrottle_cfs_rq(cfs_rq);
7258 raw_spin_unlock_irq(&rq->lock); 7087 raw_spin_unlock_irq(&rq->lock);
7259 } 7088 }
7089 if (runtime_was_enabled && !runtime_enabled)
7090 cfs_bandwidth_usage_dec();
7260out_unlock: 7091out_unlock:
7261 mutex_unlock(&cfs_constraints_mutex); 7092 mutex_unlock(&cfs_constraints_mutex);
7262 7093
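
tg_set_cfs_bandwidth() now brackets the quota update with cfs_bandwidth_usage_inc()/_dec(): the "bandwidth in use" indicator is raised before any throttling state can exist (off->on) and dropped only after it is gone again (on->off), so code that sees the indicator clear may safely skip the throttling paths. A toy, single-threaded sketch of that ordering, with all names hypothetical and a plain counter standing in for the static key:

    #include <stdbool.h>
    #include <stdio.h>

    static int bandwidth_users;          /* stands in for the static key */
    static long long quota = -1;         /* -1 ~ RUNTIME_INF */

    static bool bandwidth_used(void) { return bandwidth_users > 0; }

    static void set_quota(long long new_quota)
    {
        bool was_enabled = (quota != -1);
        bool enabled = (new_quota != -1);

        /* off->on: raise the indicator before any limited state exists */
        if (enabled && !was_enabled)
            bandwidth_users++;

        quota = new_quota;               /* ...update throttling state here... */

        /* on->off: drop the indicator only after the state is gone again */
        if (was_enabled && !enabled)
            bandwidth_users--;
    }

    int main(void)
    {
        set_quota(100000);
        printf("used=%d\n", bandwidth_used());   /* 1 */
        set_quota(-1);
        printf("used=%d\n", bandwidth_used());   /* 0 */
        return 0;
    }
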
@@ -7309,26 +7140,28 @@ long tg_get_cfs_period(struct task_group *tg)
7309 return cfs_period_us; 7140 return cfs_period_us;
7310} 7141}
7311 7142
7312static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7143static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7144 struct cftype *cft)
7313{ 7145{
7314 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7146 return tg_get_cfs_quota(css_tg(css));
7315} 7147}
7316 7148
7317static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7149static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7318 s64 cfs_quota_us) 7150 struct cftype *cftype, s64 cfs_quota_us)
7319{ 7151{
7320 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7152 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7321} 7153}
7322 7154
7323static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7155static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7156 struct cftype *cft)
7324{ 7157{
7325 return tg_get_cfs_period(cgroup_tg(cgrp)); 7158 return tg_get_cfs_period(css_tg(css));
7326} 7159}
7327 7160
7328static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7161static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7329 u64 cfs_period_us) 7162 struct cftype *cftype, u64 cfs_period_us)
7330{ 7163{
7331 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7164 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7332} 7165}
7333 7166
7334struct cfs_schedulable_data { 7167struct cfs_schedulable_data {
@@ -7409,10 +7242,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7409 return ret; 7242 return ret;
7410} 7243}
7411 7244
7412static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7245static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7413 struct cgroup_map_cb *cb) 7246 struct cgroup_map_cb *cb)
7414{ 7247{
7415 struct task_group *tg = cgroup_tg(cgrp); 7248 struct task_group *tg = css_tg(css);
7416 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7249 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7417 7250
7418 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7251 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7425,26 +7258,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7425#endif /* CONFIG_FAIR_GROUP_SCHED */ 7258#endif /* CONFIG_FAIR_GROUP_SCHED */
7426 7259
7427#ifdef CONFIG_RT_GROUP_SCHED 7260#ifdef CONFIG_RT_GROUP_SCHED
7428static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7261static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7429 s64 val) 7262 struct cftype *cft, s64 val)
7430{ 7263{
7431 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7264 return sched_group_set_rt_runtime(css_tg(css), val);
7432} 7265}
7433 7266
7434static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7267static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7268 struct cftype *cft)
7435{ 7269{
7436 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7270 return sched_group_rt_runtime(css_tg(css));
7437} 7271}
7438 7272
7439static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7273static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7440 u64 rt_period_us) 7274 struct cftype *cftype, u64 rt_period_us)
7441{ 7275{
7442 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7276 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7443} 7277}
7444 7278
7445static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7279static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7280 struct cftype *cft)
7446{ 7281{
7447 return sched_group_rt_period(cgroup_tg(cgrp)); 7282 return sched_group_rt_period(css_tg(css));
7448} 7283}
7449#endif /* CONFIG_RT_GROUP_SCHED */ 7284#endif /* CONFIG_RT_GROUP_SCHED */
7450 7285
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46f..8b836b376d91 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 63 * priority configuration.
64 * 64 *
65 * Returns: (int)bool - CPUs were found 65 * Return: (int)bool - CPUs were found
66 */ 66 */
67int cpupri_find(struct cpupri *cp, struct task_struct *p, 67int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 68 struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
203 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
204 * @cp: The cpupri context 204 * @cp: The cpupri context
205 * 205 *
206 * Returns: -ENOMEM if memory fails. 206 * Return: -ENOMEM on memory allocation failure.
207 */ 207 */
208int cpupri_init(struct cpupri *cp) 208int cpupri_init(struct cpupri *cp)
209{ 209{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..99947919e30b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
121 * is the only cgroup, then nothing else should be necessary. 121 * is the only cgroup, then nothing else should be necessary.
122 * 122 *
123 */ 123 */
124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
125 125
126 cpuacct_account_field(p, index, tmp); 126 cpuacct_account_field(p, index, tmp);
127} 127}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379 379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev) 381void vtime_common_task_switch(struct task_struct *prev)
382{ 382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev)) 383 if (is_idle_task(prev))
387 vtime_account_idle(prev); 384 vtime_account_idle(prev);
388 else 385 else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
404 * vtime_account(). 401 * vtime_account().
405 */ 402 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT 403#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk) 404void vtime_common_account_irq_enter(struct task_struct *tsk)
408{ 405{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) { 406 if (!in_interrupt()) {
413 /* 407 /*
414 * If we interrupted user, context_tracking_in_user() 408 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
428 } 422 }
429 vtime_account_system(tsk); 423 vtime_account_system(tsk);
430} 424}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 425EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 426#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 427#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434 428
@@ -557,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr,
557 struct cputime *prev, 551 struct cputime *prev,
558 cputime_t *ut, cputime_t *st) 552 cputime_t *ut, cputime_t *st)
559{ 553{
560 cputime_t rtime, stime, utime, total; 554 cputime_t rtime, stime, utime;
561
562 if (vtime_accounting_enabled()) {
563 *ut = curr->utime;
564 *st = curr->stime;
565 return;
566 }
567
568 stime = curr->stime;
569 total = stime + curr->utime;
570 555
571 /* 556 /*
572 * Tick based cputime accounting depend on random scheduling 557 * Tick based cputime accounting depend on random scheduling
@@ -588,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr,
588 if (prev->stime + prev->utime >= rtime) 573 if (prev->stime + prev->utime >= rtime)
589 goto out; 574 goto out;
590 575
591 if (total) { 576 stime = curr->stime;
577 utime = curr->utime;
578
579 if (utime == 0) {
580 stime = rtime;
581 } else if (stime == 0) {
582 utime = rtime;
583 } else {
584 cputime_t total = stime + utime;
585
592 stime = scale_stime((__force u64)stime, 586 stime = scale_stime((__force u64)stime,
593 (__force u64)rtime, (__force u64)total); 587 (__force u64)rtime, (__force u64)total);
594 utime = rtime - stime; 588 utime = rtime - stime;
595 } else {
596 stime = rtime;
597 utime = 0;
598 } 589 }
599 590
600 /* 591 /*
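
The reworked cputime_adjust() above splits the precise runtime rtime between user and system time in the ratio suggested by the tick-based samples, after handling the two degenerate cases where one sample is zero. Reduced to plain 64-bit arithmetic (the real scale_stime() exists to avoid the overflow this sketch ignores), the split looks like this:

    #include <stdio.h>
    #include <stdint.h>

    /* Divide the precise runtime 'rtime' between user and system time in the
     * ratio of the tick samples; assumes the products fit in 64 bits. */
    static void adjust(uint64_t rtime, uint64_t stime, uint64_t utime,
                       uint64_t *out_stime, uint64_t *out_utime)
    {
        if (utime == 0) {
            stime = rtime;                    /* all of it was system time */
        } else if (stime == 0) {
            utime = rtime;                    /* all of it was user time */
        } else {
            uint64_t total = stime + utime;

            stime = stime * rtime / total;    /* proportional share */
            utime = rtime - stime;
        }
        *out_stime = stime;
        *out_utime = utime;
    }

    int main(void)
    {
        uint64_t s, u;

        adjust(1000, 300, 100, &s, &u);       /* 3:1 tick ratio */
        printf("stime=%llu utime=%llu\n",     /* prints 750 and 250 */
               (unsigned long long)s, (unsigned long long)u);
        return 0;
    }
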
@@ -664,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk)
664 655
665void vtime_account_system(struct task_struct *tsk) 656void vtime_account_system(struct task_struct *tsk)
666{ 657{
667 if (!vtime_accounting_enabled())
668 return;
669
670 write_seqlock(&tsk->vtime_seqlock); 658 write_seqlock(&tsk->vtime_seqlock);
671 __vtime_account_system(tsk); 659 __vtime_account_system(tsk);
672 write_sequnlock(&tsk->vtime_seqlock); 660 write_sequnlock(&tsk->vtime_seqlock);
673} 661}
674 662
675void vtime_account_irq_exit(struct task_struct *tsk) 663void vtime_gen_account_irq_exit(struct task_struct *tsk)
676{ 664{
677 if (!vtime_accounting_enabled())
678 return;
679
680 write_seqlock(&tsk->vtime_seqlock); 665 write_seqlock(&tsk->vtime_seqlock);
666 __vtime_account_system(tsk);
681 if (context_tracking_in_user()) 667 if (context_tracking_in_user())
682 tsk->vtime_snap_whence = VTIME_USER; 668 tsk->vtime_snap_whence = VTIME_USER;
683 __vtime_account_system(tsk);
684 write_sequnlock(&tsk->vtime_seqlock); 669 write_sequnlock(&tsk->vtime_seqlock);
685} 670}
686 671
@@ -688,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk)
688{ 673{
689 cputime_t delta_cpu; 674 cputime_t delta_cpu;
690 675
691 if (!vtime_accounting_enabled())
692 return;
693
694 delta_cpu = get_vtime_delta(tsk);
695
696 write_seqlock(&tsk->vtime_seqlock); 676 write_seqlock(&tsk->vtime_seqlock);
677 delta_cpu = get_vtime_delta(tsk);
697 tsk->vtime_snap_whence = VTIME_SYS; 678 tsk->vtime_snap_whence = VTIME_SYS;
698 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 679 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
699 write_sequnlock(&tsk->vtime_seqlock); 680 write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk)
701 682
702void vtime_user_enter(struct task_struct *tsk) 683void vtime_user_enter(struct task_struct *tsk)
703{ 684{
704 if (!vtime_accounting_enabled())
705 return;
706
707 write_seqlock(&tsk->vtime_seqlock); 685 write_seqlock(&tsk->vtime_seqlock);
708 tsk->vtime_snap_whence = VTIME_USER;
709 __vtime_account_system(tsk); 686 __vtime_account_system(tsk);
687 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 688 write_sequnlock(&tsk->vtime_seqlock);
711} 689}
712 690
713void vtime_guest_enter(struct task_struct *tsk) 691void vtime_guest_enter(struct task_struct *tsk)
714{ 692{
693 /*
694 * The flags must be updated under the lock with
695 * the vtime_snap flush and update.
 696 * That enforces the right ordering and update-sequence
 697 * synchronization against the reader (task_gtime()),
 698 * which can thus safely catch up with a tickless delta.
699 */
715 write_seqlock(&tsk->vtime_seqlock); 700 write_seqlock(&tsk->vtime_seqlock);
716 __vtime_account_system(tsk); 701 __vtime_account_system(tsk);
717 current->flags |= PF_VCPU; 702 current->flags |= PF_VCPU;
718 write_sequnlock(&tsk->vtime_seqlock); 703 write_sequnlock(&tsk->vtime_seqlock);
719} 704}
705EXPORT_SYMBOL_GPL(vtime_guest_enter);
720 706
721void vtime_guest_exit(struct task_struct *tsk) 707void vtime_guest_exit(struct task_struct *tsk)
722{ 708{
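
The comment added to vtime_guest_enter() is about publishing the PF_VCPU flag and the vtime snapshot as one unit: the lockless reader (task_gtime()) must never see a new flag paired with a stale snapshot. The kernel uses the task's vtime seqlock for this; the sketch below makes the same point with an ordinary mutex on both sides, which is not how the kernel does it but shows why the two updates belong in one critical section:

    #include <pthread.h>
    #include <stdio.h>

    /* Sketch only: a mutex stands in for the vtime seqlock. The point is
     * that the flag and the snapshot are published together, so a reader
     * never sees a new flag with a stale snapshot or vice versa. */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long vtime_snap;
    static int vcpu_flag;

    static void enter_guest(unsigned long long now)
    {
        pthread_mutex_lock(&lock);
        vtime_snap = now;        /* flush/update the snapshot... */
        vcpu_flag = 1;           /* ...and flip the flag in the same section */
        pthread_mutex_unlock(&lock);
    }

    static void read_state(unsigned long long *snap, int *flag)
    {
        pthread_mutex_lock(&lock);
        *snap = vtime_snap;
        *flag = vcpu_flag;
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        unsigned long long snap; int flag;

        enter_guest(12345);
        read_state(&snap, &flag);
        printf("snap=%llu vcpu=%d\n", snap, flag);
        return 0;
    }
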
@@ -725,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk)
725 current->flags &= ~PF_VCPU; 711 current->flags &= ~PF_VCPU;
726 write_sequnlock(&tsk->vtime_seqlock); 712 write_sequnlock(&tsk->vtime_seqlock);
727} 713}
714EXPORT_SYMBOL_GPL(vtime_guest_exit);
728 715
729void vtime_account_idle(struct task_struct *tsk) 716void vtime_account_idle(struct task_struct *tsk)
730{ 717{
@@ -733,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk)
733 account_idle_time(delta_cpu); 720 account_idle_time(delta_cpu);
734} 721}
735 722
736bool vtime_accounting_enabled(void)
737{
738 return context_tracking_active();
739}
740
741void arch_vtime_task_switch(struct task_struct *prev) 723void arch_vtime_task_switch(struct task_struct *prev)
742{ 724{
743 write_seqlock(&prev->vtime_seqlock); 725 write_seqlock(&prev->vtime_seqlock);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bddd4c66..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -124,7 +125,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
124 SEQ_printf(m, " "); 125 SEQ_printf(m, " ");
125 126
126 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", 127 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
127 p->comm, p->pid, 128 p->comm, task_pid_nr(p),
128 SPLIT_NS(p->se.vruntime), 129 SPLIT_NS(p->se.vruntime),
129 (long long)(p->nvcsw + p->nivcsw), 130 (long long)(p->nvcsw + p->nivcsw),
130 p->prio); 131 p->prio);
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -289,7 +301,7 @@ do { \
289 P(nr_load_updates); 301 P(nr_load_updates);
290 P(nr_uninterruptible); 302 P(nr_uninterruptible);
291 PN(next_balance); 303 PN(next_balance);
292 P(curr->pid); 304 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
293 PN(clock); 305 PN(clock);
294 P(cpu_load[0]); 306 P(cpu_load[0]);
295 P(cpu_load[1]); 307 P(cpu_load[1]);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,11 +500,61 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
494 556
495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 557 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
496 get_nr_threads(p)); 558 get_nr_threads(p));
497 SEQ_printf(m, 559 SEQ_printf(m,
498 "---------------------------------------------------------" 560 "---------------------------------------------------------"
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9565645e3202..df77c605c7a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
 824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
 850 * Calculations are based on RSS, as non-present and empty pages are
 851 * skipped by the PTE scanner and NUMA hinting faults should be trapped
 852 * based on resident pages.
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
 863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
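
task_nr_scan_windows(), task_scan_min() and task_scan_max() above derive the scan period from the task's RSS: the address space is split into windows of sysctl_numa_balancing_scan_size MB and the configured period is spread across them, with a floor so no task scans more than MAX_SCAN_WINDOW MB per second. The arithmetic, re-done as a stand-alone program with the patch's default tunables and an assumed 4KB page size:

    #include <stdio.h>

    /* Defaults from the patch: 256MB windows, 1000ms min period, and a
     * MAX_SCAN_WINDOW of 2560MB/sec. Page size assumed to be 4KB. */
    #define PAGE_SHIFT          12
    #define SCAN_SIZE_MB        256u
    #define PERIOD_MIN_MS       1000u
    #define MAX_SCAN_WINDOW_MB  2560u

    static unsigned int nr_scan_windows(unsigned long rss_pages)
    {
        unsigned long nr_scan_pages = (unsigned long)SCAN_SIZE_MB << (20 - PAGE_SHIFT);

        if (!rss_pages)
            rss_pages = nr_scan_pages;
        /* round up to a whole number of windows */
        return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
    }

    static unsigned int scan_period_min(unsigned long rss_pages)
    {
        unsigned int windows = MAX_SCAN_WINDOW_MB / SCAN_SIZE_MB;   /* 10 */
        unsigned int floor_ms = 1000 / windows;                     /* 100ms */
        unsigned int scan = PERIOD_MIN_MS / nr_scan_windows(rss_pages);

        return scan > floor_ms ? scan : floor_ms;
    }

    int main(void)
    {
        /* A 1GB RSS task: four 256MB windows, so a 250ms minimum period */
        unsigned long rss = (1024ul << 20) >> PAGE_SHIFT;

        printf("windows=%u min_period=%ums\n",
               nr_scan_windows(rss), scan_period_min(rss));
        return 0;
    }
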
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
 894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
972{
973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
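
task_weight() and group_weight() normalise the per-node fault counts into per-mille shares of the task's (or the numa_group's) total faults, so nodes can be compared on a fixed 0-1000 scale regardless of how many faults have accumulated. A small worked example of that normalisation, with the private/shared split already summed per node:

    #include <stdio.h>

    /* Per-node fault counts become a share out of 1000 of the total. */
    static unsigned long node_weight(const unsigned long *faults, int nr_nodes, int nid)
    {
        unsigned long total = 0;

        for (int n = 0; n < nr_nodes; n++)
            total += faults[n];
        if (!total)
            return 0;
        return 1000 * faults[nid] / total;
    }

    int main(void)
    {
        unsigned long faults[2] = { 30, 90 };   /* faults on node 0 and node 1 */

        printf("node0=%lu node1=%lu\n",         /* prints 250 and 750 */
               node_weight(faults, 2, 0), node_weight(faults, 2, 1));
        return 0;
    }
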
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
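
update_numa_stats() sums the runnable load and the compute power of every CPU on a node, then scales the load relative to the node's power and rounds the power to a whole number of "average" CPUs, so has_capacity is simply nr_running < capacity. A stand-alone version of that normalisation, assuming SCHED_POWER_SCALE is 1024 as in the scheduler of this era and feeding the per-CPU figures in directly:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024ul   /* one "average" CPU's worth of power */

    struct numa_stats {
        unsigned long nr_running, load, power, capacity;
        int has_capacity;
    };

    static void node_stats(struct numa_stats *ns, const unsigned long *cpu_load,
                           const unsigned long *cpu_power,
                           const unsigned long *cpu_running, int nr_cpus)
    {
        ns->nr_running = ns->load = ns->power = 0;
        for (int i = 0; i < nr_cpus; i++) {
            ns->nr_running += cpu_running[i];
            ns->load += cpu_load[i];
            ns->power += cpu_power[i];
        }
        ns->load = ns->load * SCHED_POWER_SCALE / ns->power;
        /* round to the nearest whole "average" CPU, as DIV_ROUND_CLOSEST does */
        ns->capacity = (ns->power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
        ns->has_capacity = ns->nr_running < ns->capacity;
    }

    int main(void)
    {
        unsigned long load[2] = { 512, 256 }, power[2] = { 1024, 1024 };
        unsigned long running[2] = { 1, 0 };
        struct numa_stats ns;

        node_stats(&ns, load, power, running, 2);
        printf("load=%lu capacity=%lu has_capacity=%d\n",   /* 384 2 1 */
               ns.load, ns.capacity, ns.has_capacity);
        return 0;
    }
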
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
 1049 * be improved if the source task were migrated to the target dst_cpu, taking
 1050 * into account that it might be best if the task running on the dst_cpu
 1051 * should be exchanged with the source task.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more remote accesses would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
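The final test above only accepts a move or swap if the larger side stays within imbalance_pct percent of the smaller one; a standalone sketch with made-up loads and the 112 value used below:

#include <stdbool.h>
#include <stdio.h>

/* Make src_load the smaller, then require the bigger side to stay
 * within imbalance_pct percent of it. Loads are invented. */
static bool within_imbalance(long dst_load, long src_load, int imbalance_pct)
{
	if (dst_load < src_load) {
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}
	return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
	printf("%d\n", within_imbalance(900, 800, 112));	/* 0: too skewed */
	printf("%d\n", within_imbalance(880, 800, 112));	/* 1: acceptable */
	return 0;
}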
1159
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks, counter to the NUMA conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both task and groups benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
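For reference, assuming the common sched-domain default imbalance_pct of 125, the derived NUMA value works out to 112, so NUMA-driven placement tolerates roughly half the slack of the regular balancer; a trivial arithmetic sketch:

#include <stdio.h>

int main(void)
{
	int sd_imbalance_pct = 125;	/* assumed domain default */
	int numa_pct = 100 + (sd_imbalance_pct - 100) / 2;

	printf("numa imbalance_pct = %d\n", numa_pct);	/* 112 */
	return 0;
}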
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If the local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS)
1282 * the scan period will decrease.
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
834{ 1295{
835 int seq; 1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If no NUMA hinting faults were recorded then either the task is
1305 * completely idle or all activity is in areas that are not of interest
1306 * to automatic NUMA balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
836 1314
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1321 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower),
1322 * by at least one slot when the ratio equals the threshold
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking, the intent is that there is little point
1339 * scanning faster if shared accesses dominate, as it may
1340 * simply bounce migrations uselessly.
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
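A standalone sketch of the slot arithmetic with invented fault counts; it mirrors the effective computation above without the kernel types:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static int period_diff(int period, long local, long remote,
		       long priv, long shared)
{
	int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		diff = (slot ? slot : 1) * period_slot;
	} else {
		/* scale the decrease by the private:shared ratio */
		int pratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS,
					  priv + shared);

		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		diff = (diff * pratio) / NUMA_PERIOD_SLOTS;
	}
	return diff;
}

int main(void)
{
	printf("%d\n", period_diff(1000, 8, 2, 5, 5));	/* +500: scan slower */
	printf("%d\n", period_diff(1000, 1, 9, 5, 5));	/* -100: scan faster */
	return 0;
}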
1351
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
1431
1432 /* Preferred node as the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
843 1485
844 /* FIXME: Scheduling placement policy hints go here */ 1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
1523
1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
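struct numa_group ends in a zero-length faults array that is sized at allocation time; the same pattern shown in isolation, with a hypothetical node count and the standard C flexible array member standing in for faults[0]:

#include <stdio.h>
#include <stdlib.h>

/* Standalone sketch of the allocation pattern, not the kernel struct. */
struct fake_group {
	int nr_tasks;
	unsigned long total_faults;
	unsigned long faults[];		/* 2 * nr_node_ids entries follow */
};

int main(void)
{
	int nr_node_ids = 4;		/* assumed */
	size_t size = sizeof(struct fake_group) +
		      2 * nr_node_ids * sizeof(unsigned long);
	struct fake_group *grp = calloc(1, size);

	if (!grp)
		return 1;
	grp->faults[2 * 3 + 1] = 42;	/* node 3, private slot */
	printf("allocated %zu bytes\n", size);
	free(grp);
	return 0;
}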
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
1629 * Retry task-to-preferred-node migration periodically, in case it
1630 * previously failed, or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
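Per-node fault counters are laid out as (node, private) pairs via task_faults_idx(); a toy sketch of the indexing with a hypothetical four-node array:

#include <stdio.h>

/* index = 2 * nid + priv, priv == 1 meaning the last accessor was
 * the same task. Node count and fault values are invented. */
static inline int faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

int main(void)
{
	unsigned long faults[2 * 4] = { 0 };	/* 4 nodes assumed */

	faults[faults_idx(1, 1)] += 8;	/* private faults on node 1 */
	faults[faults_idx(1, 0)] += 2;	/* shared faults on node 1  */

	printf("node1 total = %lu\n",
	       faults[faults_idx(1, 0)] + faults[faults_idx(1, 1)]);
	return 0;
}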
869 1641
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to the vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2032,6 +2807,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2032 */ 2807 */
2033 update_entity_load_avg(curr, 1); 2808 update_entity_load_avg(curr, 1);
2034 update_cfs_rq_blocked_load(cfs_rq, 1); 2809 update_cfs_rq_blocked_load(cfs_rq, 1);
2810 update_cfs_shares(cfs_rq);
2035 2811
2036#ifdef CONFIG_SCHED_HRTICK 2812#ifdef CONFIG_SCHED_HRTICK
2037 /* 2813 /*
@@ -2069,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2069 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2070} 2846}
2071 2847
2072void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2073{ 2849{
2074 /* only need to count groups transitioning between enabled/!enabled */ 2850 static_key_slow_inc(&__cfs_bandwidth_used);
2075 if (enabled && !was_enabled) 2851}
2076 static_key_slow_inc(&__cfs_bandwidth_used); 2852
2077 else if (!enabled && was_enabled) 2853void cfs_bandwidth_usage_dec(void)
2078 static_key_slow_dec(&__cfs_bandwidth_used); 2854{
2855 static_key_slow_dec(&__cfs_bandwidth_used);
2079} 2856}
2080#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2081static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2083,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2083 return true; 2860 return true;
2084} 2861}
2085 2862
2086void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2087#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
2088 2866
2089/* 2867/*
@@ -2334,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2334 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2335 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2336 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2337 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2338} 3118}
2339 3119
@@ -2447,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2447 if (idle) 3227 if (idle)
2448 goto out_unlock; 3228 goto out_unlock;
2449 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2450 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2451 3238
2452 if (!throttled) { 3239 if (!throttled) {
@@ -2507,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2507/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2508static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2509 3296
2510/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2511static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2512{ 3305{
2513 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2583,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2583 u64 expires; 3376 u64 expires;
2584 3377
2585 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2586 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2587 return; 3382 return;
3383 }
2588 3384
2589 raw_spin_lock(&cfs_b->lock);
2590 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2591 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2592 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2707,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2707 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2708 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2709 */ 3504 */
2710 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2711 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2712 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2713 hrtimer_cancel(&cfs_b->period_timer);
2714
2715 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2716 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2717 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3017,6 +3812,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3017 return 0; 3812 return 0;
3018} 3813}
3019 3814
3815static void record_wakee(struct task_struct *p)
3816{
3817 /*
3818 * Rough decay (wiping) for cost saving, don't worry
3819 * about the boundary, really active task won't care
3820 * about the loss.
3821 */
3822 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3823 current->wakee_flips = 0;
3824 current->wakee_flip_decay_ts = jiffies;
3825 }
3826
3827 if (current->last_wakee != p) {
3828 current->last_wakee = p;
3829 current->wakee_flips++;
3830 }
3831}
3020 3832
3021static void task_waking_fair(struct task_struct *p) 3833static void task_waking_fair(struct task_struct *p)
3022{ 3834{
@@ -3037,6 +3849,7 @@ static void task_waking_fair(struct task_struct *p)
3037#endif 3849#endif
3038 3850
3039 se->vruntime -= min_vruntime; 3851 se->vruntime -= min_vruntime;
3852 record_wakee(p);
3040} 3853}
3041 3854
3042#ifdef CONFIG_FAIR_GROUP_SCHED 3855#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3094,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3094{ 3907{
3095 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3096 3909
3097 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3098 return wl; 3911 return wl;
3099 3912
3100 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3147,14 +3960,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3147} 3960}
3148#else 3961#else
3149 3962
3150static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3151 unsigned long wl, unsigned long wg)
3152{ 3964{
3153 return wl; 3965 return wl;
3154} 3966}
3155 3967
3156#endif 3968#endif
3157 3969
3970static int wake_wide(struct task_struct *p)
3971{
3972 int factor = this_cpu_read(sd_llc_size);
3973
3974 /*
3975 * wakee_flips is a switching frequency; a high value can mean many
3976 * wakees or rapid switching. Using the LLC size as the factor here
3977 * automatically adjusts the threshold, so a bigger node leads to more pull.
3978 */
3979 if (p->wakee_flips > factor) {
3980 /*
3981 * The wakee is somewhat hot and needs a certain amount of cpu
3982 * resource, so if the waker is far hotter, prefer to leave
3983 * it alone.
3984 */
3985 if (current->wakee_flips > (factor * p->wakee_flips))
3986 return 1;
3987 }
3988
3989 return 0;
3990}
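A standalone sketch of the wake_wide() decision with invented flip counts, where factor stands in for this_cpu_read(sd_llc_size):

#include <stdio.h>

static int wake_wide_sketch(unsigned int waker_flips,
			    unsigned int wakee_flips, unsigned int factor)
{
	/* wakee flips a lot and the waker flips far more: spread out */
	if (wakee_flips > factor && waker_flips > factor * wakee_flips)
		return 1;
	return 0;
}

int main(void)
{
	/* assumed 8-CPU LLC */
	printf("%d\n", wake_wide_sketch(200, 10, 8));	/* 1: skip affine path */
	printf("%d\n", wake_wide_sketch(20, 10, 8));	/* 0: stay affine      */
	return 0;
}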
3991
3158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3992static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3159{ 3993{
3160 s64 this_load, load; 3994 s64 this_load, load;
@@ -3164,6 +3998,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3164 unsigned long weight; 3998 unsigned long weight;
3165 int balanced; 3999 int balanced;
3166 4000
4001 /*
4002 * If we wake multiple tasks, be careful not to bounce
4003 * ourselves around too much.
4004 */
4005 if (wake_wide(p))
4006 return 0;
4007
3167 idx = sd->wake_idx; 4008 idx = sd->wake_idx;
3168 this_cpu = smp_processor_id(); 4009 this_cpu = smp_processor_id();
3169 prev_cpu = task_cpu(p); 4010 prev_cpu = task_cpu(p);
@@ -3372,11 +4213,10 @@ done:
3372 * preempt must be disabled. 4213 * preempt must be disabled.
3373 */ 4214 */
3374static int 4215static int
3375select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3376{ 4217{
3377 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3378 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3379 int prev_cpu = task_cpu(p);
3380 int new_cpu = cpu; 4220 int new_cpu = cpu;
3381 int want_affine = 0; 4221 int want_affine = 0;
3382 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3856,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3856 4696
3857static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3858 4698
4699enum fbq_type { regular, remote, all };
4700
3859#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3860#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3861#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3862 4705
3863struct lb_env { 4706struct lb_env {
3864 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3881,6 +4724,8 @@ struct lb_env {
3881 unsigned int loop; 4724 unsigned int loop;
3882 unsigned int loop_break; 4725 unsigned int loop_break;
3883 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3884}; 4729};
3885 4730
3886/* 4731/*
@@ -3927,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3927 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3928} 4773}
3929 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight gets worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
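The two helpers above boil down to a simple rule; an isolated sketch with made-up weights (the preferred node always wins, otherwise both task and group weight must improve):

#include <stdbool.h>
#include <stdio.h>

static bool improves_locality(int dst_is_preferred,
			      long task_src_w, long task_dst_w,
			      long group_src_w, long group_dst_w)
{
	if (dst_is_preferred)
		return true;
	return task_dst_w > task_src_w && group_dst_w > group_src_w;
}

int main(void)
{
	printf("%d\n", improves_locality(0, 300, 500, 400, 600));	/* 1 */
	printf("%d\n", improves_locality(0, 300, 500, 600, 400));	/* 0 */
	return 0;
}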
4846
3930/* 4847/*
3931 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3932 */ 4849 */
@@ -3949,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3949 4866
3950 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3951 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
3952 /* 4871 /*
3953 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
3954 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -3957,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3957 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
3958 * one in current iteration. 4877 * one in current iteration.
3959 */ 4878 */
3960 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
3961 return 0; 4880 return 0;
3962 4881
3963 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
3964 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3965 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3966 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
3967 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
3968 break; 4887 break;
3969 } 4888 }
@@ -3982,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3982 4901
3983 /* 4902 /*
3984 * Aggressive migration if: 4903 * Aggressive migration if:
3985 * 1) task is cache cold, or 4904 * 1) destination numa is preferred
3986 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
3987 */ 4907 */
3988
3989 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
3990 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
3991 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3992 4924
@@ -4029,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4029 return 0; 4961 return 0;
4030} 4962}
4031 4963
4032static unsigned long task_h_load(struct task_struct *p);
4033
4034static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4035 4965
4036/* 4966/*
@@ -4171,47 +5101,48 @@ static void update_blocked_averages(int cpu)
4171} 5101}
4172 5102
4173/* 5103/*
4174 * Compute the cpu's hierarchical load factor for each task group. 5104 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
4175 * This needs to be done in a top-down fashion because the load of a child 5105 * This needs to be done in a top-down fashion because the load of a child
4176 * group is a fraction of its parents load. 5106 * group is a fraction of its parent's load.
4177 */ 5107 */
4178static int tg_load_down(struct task_group *tg, void *data) 5108static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4179{
4180 unsigned long load;
4181 long cpu = (long)data;
4182
4183 if (!tg->parent) {
4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4185 } else {
4186 load = tg->parent->cfs_rq[cpu]->h_load;
4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4189 }
4190
4191 tg->cfs_rq[cpu]->h_load = load;
4192
4193 return 0;
4194}
4195
4196static void update_h_load(long cpu)
4197{ 5109{
4198 struct rq *rq = cpu_rq(cpu); 5110 struct rq *rq = rq_of(cfs_rq);
5111 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4199 unsigned long now = jiffies; 5112 unsigned long now = jiffies;
5113 unsigned long load;
4200 5114
4201 if (rq->h_load_throttle == now) 5115 if (cfs_rq->last_h_load_update == now)
4202 return; 5116 return;
4203 5117
4204 rq->h_load_throttle = now; 5118 cfs_rq->h_load_next = NULL;
5119 for_each_sched_entity(se) {
5120 cfs_rq = cfs_rq_of(se);
5121 cfs_rq->h_load_next = se;
5122 if (cfs_rq->last_h_load_update == now)
5123 break;
5124 }
5125
5126 if (!se) {
5127 cfs_rq->h_load = cfs_rq->runnable_load_avg;
5128 cfs_rq->last_h_load_update = now;
5129 }
4205 5130
4206 rcu_read_lock(); 5131 while ((se = cfs_rq->h_load_next) != NULL) {
4207 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 5132 load = cfs_rq->h_load;
4208 rcu_read_unlock(); 5133 load = div64_ul(load * se->avg.load_avg_contrib,
5134 cfs_rq->runnable_load_avg + 1);
5135 cfs_rq = group_cfs_rq(se);
5136 cfs_rq->h_load = load;
5137 cfs_rq->last_h_load_update = now;
5138 }
4209} 5139}
4210 5140
4211static unsigned long task_h_load(struct task_struct *p) 5141static unsigned long task_h_load(struct task_struct *p)
4212{ 5142{
4213 struct cfs_rq *cfs_rq = task_cfs_rq(p); 5143 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4214 5144
5145 update_cfs_rq_h_load(cfs_rq);
4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 5146 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1); 5147 cfs_rq->runnable_load_avg + 1);
4217} 5148}
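A two-level worked example of the h_load propagation with invented load figures: the root cfs_rq's h_load equals its runnable load, and each level below scales by its entity's contribution over the parent's runnable load (+1 to avoid dividing by zero):

#include <stdio.h>

int main(void)
{
	unsigned long long root_runnable = 1024;
	unsigned long long root_h_load = root_runnable;

	unsigned long long grp_se_contrib = 512;	/* group entity on root */
	unsigned long long grp_h_load =
		root_h_load * grp_se_contrib / (root_runnable + 1);	/* 511 */

	unsigned long long grp_runnable = 512;
	unsigned long long task_contrib = 256;		/* task inside the group */
	unsigned long long task_h_load =
		task_contrib * grp_h_load / (grp_runnable + 1);		/* 255 */

	printf("grp_h_load=%llu task_h_load=%llu\n", grp_h_load, task_h_load);
	return 0;
}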
@@ -4220,10 +5151,6 @@ static inline void update_blocked_averages(int cpu)
4220{ 5151{
4221} 5152}
4222 5153
4223static inline void update_h_load(long cpu)
4224{
4225}
4226
4227static unsigned long task_h_load(struct task_struct *p) 5154static unsigned long task_h_load(struct task_struct *p)
4228{ 5155{
4229 return p->se.avg.load_avg_contrib; 5156 return p->se.avg.load_avg_contrib;
@@ -4232,54 +5159,66 @@ static unsigned long task_h_load(struct task_struct *p)
4232 5159
4233/********** Helpers for find_busiest_group ************************/ 5160/********** Helpers for find_busiest_group ************************/
4234/* 5161/*
4235 * sd_lb_stats - Structure to store the statistics of a sched_domain
4236 * during load balancing.
4237 */
4238struct sd_lb_stats {
4239 struct sched_group *busiest; /* Busiest group in this sd */
4240 struct sched_group *this; /* Local group in this sd */
4241 unsigned long total_load; /* Total load of all groups in sd */
4242 unsigned long total_pwr; /* Total power of all groups in sd */
4243 unsigned long avg_load; /* Average load across all groups in sd */
4244
4245 /** Statistics of this group */
4246 unsigned long this_load;
4247 unsigned long this_load_per_task;
4248 unsigned long this_nr_running;
4249 unsigned long this_has_capacity;
4250 unsigned int this_idle_cpus;
4251
4252 /* Statistics of the busiest group */
4253 unsigned int busiest_idle_cpus;
4254 unsigned long max_load;
4255 unsigned long busiest_load_per_task;
4256 unsigned long busiest_nr_running;
4257 unsigned long busiest_group_capacity;
4258 unsigned long busiest_has_capacity;
4259 unsigned int busiest_group_weight;
4260
4261 int group_imb; /* Is there imbalance in this sd */
4262};
4263
4264/*
4265 * sg_lb_stats - stats of a sched_group required for load_balancing 5162 * sg_lb_stats - stats of a sched_group required for load_balancing
4266 */ 5163 */
4267struct sg_lb_stats { 5164struct sg_lb_stats {
4268 unsigned long avg_load; /*Avg load across the CPUs of the group */ 5165 unsigned long avg_load; /*Avg load across the CPUs of the group */
4269 unsigned long group_load; /* Total load over the CPUs of the group */ 5166 unsigned long group_load; /* Total load over the CPUs of the group */
4270 unsigned long sum_nr_running; /* Nr tasks running in the group */
4271 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5167 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4272 unsigned long group_capacity; 5168 unsigned long load_per_task;
4273 unsigned long idle_cpus; 5169 unsigned long group_power;
4274 unsigned long group_weight; 5170 unsigned int sum_nr_running; /* Nr tasks running in the group */
5171 unsigned int group_capacity;
5172 unsigned int idle_cpus;
5173 unsigned int group_weight;
4275 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4276 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
5180};
5181
5182/*
5183 * sd_lb_stats - Structure to store the statistics of a sched_domain
5184 * during load balancing.
5185 */
5186struct sd_lb_stats {
5187 struct sched_group *busiest; /* Busiest group in this sd */
5188 struct sched_group *local; /* Local group in this sd */
5189 unsigned long total_load; /* Total load of all groups in sd */
5190 unsigned long total_pwr; /* Total power of all groups in sd */
5191 unsigned long avg_load; /* Average load across all groups in sd */
5192
5193 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5194 struct sg_lb_stats local_stat; /* Statistics of the local group */
4277}; 5195};
4278 5196
5197static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5198{
5199 /*
5200 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
5201 * local_stat because update_sg_lb_stats() does a full clear/assignment.
5202 * We must however clear busiest_stat::avg_load because
5203 * update_sd_pick_busiest() reads this before assignment.
5204 */
5205 *sds = (struct sd_lb_stats){
5206 .busiest = NULL,
5207 .local = NULL,
5208 .total_load = 0UL,
5209 .total_pwr = 0UL,
5210 .busiest_stat = {
5211 .avg_load = 0UL,
5212 },
5213 };
5214}
5215
4279/** 5216/**
4280 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4281 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4282 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
5220 *
5221 * Return: The load index.
4283 */ 5222 */
4284static inline int get_sd_load_idx(struct sched_domain *sd, 5223static inline int get_sd_load_idx(struct sched_domain *sd,
4285 enum cpu_idle_type idle) 5224 enum cpu_idle_type idle)
@@ -4394,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4394{ 5333{
4395 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4396 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4397 unsigned long power; 5336 unsigned long power, power_orig;
4398 unsigned long interval; 5337 unsigned long interval;
4399 5338
4400 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4406,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4406 return; 5345 return;
4407 } 5346 }
4408 5347
4409 power = 0; 5348 power_orig = power = 0;
4410 5349
4411 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4412 /* 5351 /*
@@ -4414,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4414 * span the current group. 5353 * span the current group.
4415 */ 5354 */
4416 5355
4417 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4418 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4419 } else { 5362 } else {
4420 /* 5363 /*
4421 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4424,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4424 5367
4425 group = child->groups; 5368 group = child->groups;
4426 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4427 power += group->sgp->power; 5371 power += group->sgp->power;
4428 group = group->next; 5372 group = group->next;
4429 } while (group != child->groups); 5373 } while (group != child->groups);
4430 } 5374 }
4431 5375
4432 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4433} 5378}
4434 5379
4435/* 5380/*
@@ -4457,33 +5402,84 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4457 return 0; 5402 return 0;
4458} 5403}
4459 5404
5405/*
5406 * Group imbalance indicates (and tries to solve) the problem where balancing
5407 * groups is inadequate due to tsk_cpus_allowed() constraints.
5408 *
5409 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
5410 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5411 * Something like:
5412 *
5413 * { 0 1 2 3 } { 4 5 6 7 }
5414 * * * * *
5415 *
5416 * If we were to balance group-wise we'd place two tasks in the first group and
5417 * two tasks in the second group. Clearly this is undesired as it will overload
5418 * cpu 3 and leave one of the cpus in the second group unused.
5419 *
5420 * The current solution to this issue is detecting the skew in the first group
5421 * by noticing the lower domain failed to reach balance and had difficulty
5422 * moving tasks due to affinity constraints.
5423 *
5424 * When this is detected, this group becomes a candidate for busiest; see
5425 * update_sd_pick_busiest(). And calculate_imbalance() and
5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
5427 * to create an effective group imbalance.
5428 *
5429 * This is a somewhat tricky proposition since the next run might not find the
5430 * group imbalance and decide the groups need to be balanced again. A most
5431 * subtle and fragile situation.
5432 */
5433
5434static inline int sg_imbalanced(struct sched_group *group)
5435{
5436 return group->sgp->imbalance;
5437}
5438
5439/*
5440 * Compute the group capacity.
5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limiting the power-unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
5447{
5448 unsigned int capacity, smt, cpus;
5449 unsigned int power, power_orig;
5450
5451 power = group->sgp->power;
5452 power_orig = group->sgp->power_orig;
5453 cpus = group->group_weight;
5454
5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
5457 capacity = cpus / smt; /* cores */
5458
5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
5464}
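A worked example of the phantom-core correction, assuming an SMT thread power of roughly 589 (about 1.15 * SCHED_POWER_SCALE / 2): eight threads on four cores would naively round to five capacity units, while dividing out the SMT factor first gives four.

#include <stdio.h>

#define SCHED_POWER_SCALE	1024U
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 8;
	unsigned int power = 8 * 589;		/* == power_orig in this sketch */

	unsigned int naive = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); /* 5 */
	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power); /* 2 */
	unsigned int cores = cpus / smt;				   /* 4 */

	printf("naive=%u capacity=%u\n", naive, cores < naive ? cores : naive);
	return 0;
}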
5465
4460/** 5466/**
4461 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5467 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4462 * @env: The load balancing environment. 5468 * @env: The load balancing environment.
4463 * @group: sched_group whose statistics are to be updated. 5469 * @group: sched_group whose statistics are to be updated.
4464 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5470 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4465 * @local_group: Does group contain this_cpu. 5471 * @local_group: Does group contain this_cpu.
4466 * @balance: Should we balance.
4467 * @sgs: variable to hold the statistics for this group. 5472 * @sgs: variable to hold the statistics for this group.
4468 */ 5473 */
4469static inline void update_sg_lb_stats(struct lb_env *env, 5474static inline void update_sg_lb_stats(struct lb_env *env,
4470 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4471 int local_group, int *balance, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4472{ 5477{
4473 unsigned long nr_running, max_nr_running, min_nr_running; 5478 unsigned long nr_running;
4474 unsigned long load, max_cpu_load, min_cpu_load; 5479 unsigned long load;
4475 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4476 unsigned long avg_load_per_task = 0;
4477 int i; 5480 int i;
4478 5481
4479 if (local_group) 5482 memset(sgs, 0, sizeof(*sgs));
4480 balance_cpu = group_balance_cpu(group);
4481
4482 /* Tally up the load of all CPUs in the group */
4483 max_cpu_load = 0;
4484 min_cpu_load = ~0UL;
4485 max_nr_running = 0;
4486 min_nr_running = ~0UL;
4487 5483
4488 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4489 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4491,76 +5487,34 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4491 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4492 5488
4493 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4494 if (local_group) { 5490 if (local_group)
4495 if (idle_cpu(i) && !first_idle_cpu &&
4496 cpumask_test_cpu(i, sched_group_mask(group))) {
4497 first_idle_cpu = 1;
4498 balance_cpu = i;
4499 }
4500
4501 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4502 } else { 5492 else
4503 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4504 if (load > max_cpu_load)
4505 max_cpu_load = load;
4506 if (min_cpu_load > load)
4507 min_cpu_load = load;
4508
4509 if (nr_running > max_nr_running)
4510 max_nr_running = nr_running;
4511 if (min_nr_running > nr_running)
4512 min_nr_running = nr_running;
4513 }
4514 5494
4515 sgs->group_load += load; 5495 sgs->group_load += load;
4516 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4517 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4518 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4519 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4520 } 5504 }
4521 5505
4522 /*
4523 * First idle cpu or the first cpu(busiest) in this sched group
4524 * is eligible for doing load balancing at this and above
4525 * domains. In the newly idle case, we will allow all the cpu's
4526 * to do the newly idle load balance.
4527 */
4528 if (local_group) {
4529 if (env->idle != CPU_NEWLY_IDLE) {
4530 if (balance_cpu != env->dst_cpu) {
4531 *balance = 0;
4532 return;
4533 }
4534 update_group_power(env->sd, env->dst_cpu);
4535 } else if (time_after_eq(jiffies, group->sgp->next_update))
4536 update_group_power(env->sd, env->dst_cpu);
4537 }
4538
4539 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4540 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 5507 sgs->group_power = group->sgp->power;
5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4541 5509
4542 /*
4543 * Consider the group unbalanced when the imbalance is larger
4544 * than the average weight of a task.
4545 *
4546 * APZ: with cgroup the avg task weight can vary wildly and
4547 * might not be a suitable number - should we keep a
4548 * normalized nr_running number somewhere that negates
4549 * the hierarchy?
4550 */
4551 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4552 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4553 5512
4554 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4555 (max_nr_running - min_nr_running) > 1)
4556 sgs->group_imb = 1;
4557
4558 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4559 SCHED_POWER_SCALE);
4560 if (!sgs->group_capacity)
4561 sgs->group_capacity = fix_small_capacity(env->sd, group);
4562 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4563 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4564 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4565 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4566} 5520}
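As a quick sanity check of the two new per-group figures derived above, the following standalone sketch (not kernel code; all numbers invented, SCHED_POWER_SCALE assumed to be 1024 and a nice-0 task weight of 1024) reproduces the arithmetic for a two-CPU group carrying three nice-0 tasks.

	#include <stdio.h>

	#define SCHED_POWER_SCALE 1024UL

	int main(void)
	{
		/* three nice-0 tasks (weight 1024 each) spread over two CPUs */
		unsigned long group_load = 3 * 1024;
		unsigned long sum_weighted_load = 3 * 1024;
		unsigned long sum_nr_running = 3;
		unsigned long group_power = 2 * 1024;	/* two full-power CPUs */

		unsigned long avg_load = group_load * SCHED_POWER_SCALE / group_power;
		unsigned long load_per_task = sum_weighted_load / sum_nr_running;

		/* avg_load 1536: the group runs at 1.5x one CPU's worth of load */
		printf("avg_load=%lu load_per_task=%lu\n", avg_load, load_per_task);
		return 0;
	}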
@@ -4574,13 +5528,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4574 * 5528 *
4575 * Determine if @sg is a busier group than the previously selected 5529 * Determine if @sg is a busier group than the previously selected
4576 * busiest group. 5530 * busiest group.
5531 *
5532 * Return: %true if @sg is a busier group than the previously selected
5533 * busiest group. %false otherwise.
4577 */ 5534 */
4578static bool update_sd_pick_busiest(struct lb_env *env, 5535static bool update_sd_pick_busiest(struct lb_env *env,
4579 struct sd_lb_stats *sds, 5536 struct sd_lb_stats *sds,
4580 struct sched_group *sg, 5537 struct sched_group *sg,
4581 struct sg_lb_stats *sgs) 5538 struct sg_lb_stats *sgs)
4582{ 5539{
4583 if (sgs->avg_load <= sds->max_load) 5540 if (sgs->avg_load <= sds->busiest_stat.avg_load)
4584 return false; 5541 return false;
4585 5542
4586 if (sgs->sum_nr_running > sgs->group_capacity) 5543 if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4606,18 +5563,46 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4606 return false; 5563 return false;
4607} 5564}
4608 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
5595
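The two classifiers above feed env->fbq_type, which find_busiest_queue() later uses to skip runqueues that hold only well-placed NUMA tasks. A minimal user-space sketch of the same decision, assuming the enum orders regular < remote < all as the later `rt > env->fbq_type` test implies (the classify() helper name is mine, not the kernel's):

	#include <stdio.h>

	enum fbq_type { regular, remote, all };

	static enum fbq_type classify(unsigned int nr_running,
				      unsigned int nr_numa_running,
				      unsigned int nr_preferred_running)
	{
		if (nr_running > nr_numa_running)	/* some non-NUMA tasks present */
			return regular;
		if (nr_running > nr_preferred_running)	/* NUMA tasks on the wrong node */
			return remote;
		return all;				/* everything ideally placed */
	}

	int main(void)
	{
		printf("%d\n", classify(4, 3, 2));	/* 0: regular */
		printf("%d\n", classify(4, 4, 2));	/* 1: remote  */
		printf("%d\n", classify(4, 4, 4));	/* 2: all     */
		return 0;
	}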
4609/** 5596/**
4610 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4611 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4612 * @balance: Should we balance.
4613 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4614 */ 5600 */
4615static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4616 int *balance, struct sd_lb_stats *sds)
4617{ 5602{
4618 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4619 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
4620 struct sg_lb_stats sgs; 5605 struct sg_lb_stats tmp_sgs;
4621 int load_idx, prefer_sibling = 0; 5606 int load_idx, prefer_sibling = 0;
4622 5607
4623 if (child && child->flags & SD_PREFER_SIBLING) 5608 if (child && child->flags & SD_PREFER_SIBLING)
@@ -4626,17 +5611,23 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4626 load_idx = get_sd_load_idx(env->sd, env->idle); 5611 load_idx = get_sd_load_idx(env->sd, env->idle);
4627 5612
4628 do { 5613 do {
5614 struct sg_lb_stats *sgs = &tmp_sgs;
4629 int local_group; 5615 int local_group;
4630 5616
4631 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 5617 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4632 memset(&sgs, 0, sizeof(sgs)); 5618 if (local_group) {
4633 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); 5619 sds->local = sg;
5620 sgs = &sds->local_stat;
4634 5621
4635 if (local_group && !(*balance)) 5622 if (env->idle != CPU_NEWLY_IDLE ||
4636 return; 5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
5625 }
4637 5626
4638 sds->total_load += sgs.group_load; 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4639 sds->total_pwr += sg->sgp->power; 5628
5629 if (local_group)
5630 goto next_group;
4640 5631
4641 /* 5632 /*
4642 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
@@ -4648,30 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4648 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
4649 * with a large weight task outweighs the tasks on the system). 5640 * with a large weight task outweighs the tasks on the system).
4650 */ 5641 */
4651 if (prefer_sibling && !local_group && sds->this_has_capacity) 5642 if (prefer_sibling && sds->local &&
4652 sgs.group_capacity = min(sgs.group_capacity, 1UL); 5643 sds->local_stat.group_has_capacity)
5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4653 5645
4654 if (local_group) { 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4655 sds->this_load = sgs.avg_load;
4656 sds->this = sg;
4657 sds->this_nr_running = sgs.sum_nr_running;
4658 sds->this_load_per_task = sgs.sum_weighted_load;
4659 sds->this_has_capacity = sgs.group_has_capacity;
4660 sds->this_idle_cpus = sgs.idle_cpus;
4661 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4662 sds->max_load = sgs.avg_load;
4663 sds->busiest = sg; 5647 sds->busiest = sg;
4664 sds->busiest_nr_running = sgs.sum_nr_running; 5648 sds->busiest_stat = *sgs;
4665 sds->busiest_idle_cpus = sgs.idle_cpus;
4666 sds->busiest_group_capacity = sgs.group_capacity;
4667 sds->busiest_load_per_task = sgs.sum_weighted_load;
4668 sds->busiest_has_capacity = sgs.group_has_capacity;
4669 sds->busiest_group_weight = sgs.group_weight;
4670 sds->group_imb = sgs.group_imb;
4671 } 5649 }
4672 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4673 sg = sg->next; 5656 sg = sg->next;
4674 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4675} 5661}
4676 5662
4677/** 5663/**
@@ -4691,7 +5677,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 4691 * assuming lower CPU number will be equivalent to a lower SMT thread 5677 * assuming lower CPU number will be equivalent to a lower SMT thread
4692 * number. 5678 * number.
4693 * 5679 *
4694 * Returns 1 when packing is required and a task should be moved to 5680 * Return: 1 when packing is required and a task should be moved to
4695 * this CPU. The amount of the imbalance is returned in *imbalance. 5681 * this CPU. The amount of the imbalance is returned in *imbalance.
4696 * 5682 *
4697 * @env: The load balancing environment. 5683 * @env: The load balancing environment.
@@ -4712,7 +5698,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4712 return 0; 5698 return 0;
4713 5699
4714 env->imbalance = DIV_ROUND_CLOSEST( 5700 env->imbalance = DIV_ROUND_CLOSEST(
4715 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); 5701 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
5702 SCHED_POWER_SCALE);
4716 5703
4717 return 1; 5704 return 1;
4718} 5705}
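The rewritten imbalance formula simply scales the busiest group's per-power average load back into absolute load using that same group's power. A worked example with invented figures, assuming SCHED_POWER_SCALE is 1024: with busiest_stat.avg_load = 1536 and busiest_stat.group_power = 2048,

	imbalance = DIV_ROUND_CLOSEST(1536 * 2048, 1024) = 3072

which is exactly the unscaled load the busiest group is carrying, since avg_load was originally computed as group_load * 1024 / group_power.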
@@ -4730,24 +5717,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4730 unsigned long tmp, pwr_now = 0, pwr_move = 0; 5717 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4731 unsigned int imbn = 2; 5718 unsigned int imbn = 2;
4732 unsigned long scaled_busy_load_per_task; 5719 unsigned long scaled_busy_load_per_task;
5720 struct sg_lb_stats *local, *busiest;
4733 5721
4734 if (sds->this_nr_running) { 5722 local = &sds->local_stat;
4735 sds->this_load_per_task /= sds->this_nr_running; 5723 busiest = &sds->busiest_stat;
4736 if (sds->busiest_load_per_task > 5724
4737 sds->this_load_per_task) 5725 if (!local->sum_nr_running)
4738 imbn = 1; 5726 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4739 } else { 5727 else if (busiest->load_per_task > local->load_per_task)
4740 sds->this_load_per_task = 5728 imbn = 1;
4741 cpu_avg_load_per_task(env->dst_cpu);
4742 }
4743 5729
4744 scaled_busy_load_per_task = sds->busiest_load_per_task 5730 scaled_busy_load_per_task =
4745 * SCHED_POWER_SCALE; 5731 (busiest->load_per_task * SCHED_POWER_SCALE) /
4746 scaled_busy_load_per_task /= sds->busiest->sgp->power; 5732 busiest->group_power;
4747 5733
4748 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 5734 if (busiest->avg_load + scaled_busy_load_per_task >=
4749 (scaled_busy_load_per_task * imbn)) { 5735 local->avg_load + (scaled_busy_load_per_task * imbn)) {
4750 env->imbalance = sds->busiest_load_per_task; 5736 env->imbalance = busiest->load_per_task;
4751 return; 5737 return;
4752 } 5738 }
4753 5739
@@ -4757,34 +5743,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4757 * moving them. 5743 * moving them.
4758 */ 5744 */
4759 5745
4760 pwr_now += sds->busiest->sgp->power * 5746 pwr_now += busiest->group_power *
4761 min(sds->busiest_load_per_task, sds->max_load); 5747 min(busiest->load_per_task, busiest->avg_load);
4762 pwr_now += sds->this->sgp->power * 5748 pwr_now += local->group_power *
4763 min(sds->this_load_per_task, sds->this_load); 5749 min(local->load_per_task, local->avg_load);
4764 pwr_now /= SCHED_POWER_SCALE; 5750 pwr_now /= SCHED_POWER_SCALE;
4765 5751
4766 /* Amount of load we'd subtract */ 5752 /* Amount of load we'd subtract */
4767 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 5753 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4768 sds->busiest->sgp->power; 5754 busiest->group_power;
4769 if (sds->max_load > tmp) 5755 if (busiest->avg_load > tmp) {
4770 pwr_move += sds->busiest->sgp->power * 5756 pwr_move += busiest->group_power *
4771 min(sds->busiest_load_per_task, sds->max_load - tmp); 5757 min(busiest->load_per_task,
5758 busiest->avg_load - tmp);
5759 }
4772 5760
4773 /* Amount of load we'd add */ 5761 /* Amount of load we'd add */
4774 if (sds->max_load * sds->busiest->sgp->power < 5762 if (busiest->avg_load * busiest->group_power <
4775 sds->busiest_load_per_task * SCHED_POWER_SCALE) 5763 busiest->load_per_task * SCHED_POWER_SCALE) {
4776 tmp = (sds->max_load * sds->busiest->sgp->power) / 5764 tmp = (busiest->avg_load * busiest->group_power) /
4777 sds->this->sgp->power; 5765 local->group_power;
4778 else 5766 } else {
4779 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 5767 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4780 sds->this->sgp->power; 5768 local->group_power;
4781 pwr_move += sds->this->sgp->power * 5769 }
4782 min(sds->this_load_per_task, sds->this_load + tmp); 5770 pwr_move += local->group_power *
5771 min(local->load_per_task, local->avg_load + tmp);
4783 pwr_move /= SCHED_POWER_SCALE; 5772 pwr_move /= SCHED_POWER_SCALE;
4784 5773
4785 /* Move if we gain throughput */ 5774 /* Move if we gain throughput */
4786 if (pwr_move > pwr_now) 5775 if (pwr_move > pwr_now)
4787 env->imbalance = sds->busiest_load_per_task; 5776 env->imbalance = busiest->load_per_task;
4788} 5777}
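For a feel of when this path actually moves anything, here is a user-space transcription (not kernel code) of the pwr_now/pwr_move bookkeeping above for one invented pair of single-CPU groups; SCHED_POWER_SCALE is assumed to be 1024, and the early "move one whole task" shortcut is skipped because the load gap (256) is smaller than one busiest task (512). With these figures the two totals come out equal, so the imbalance is left at zero and no task is pulled.

	#include <stdio.h>

	#define SCALE 1024UL
	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int main(void)
	{
		/* busiest: a few small tasks; local: one nice-0 task (all invented) */
		unsigned long b_power = 1024, b_avg = 1280, b_lpt = 512;
		unsigned long l_power = 1024, l_avg = 1024, l_lpt = 1024;
		unsigned long pwr_now = 0, pwr_move = 0, tmp;

		pwr_now += b_power * MIN(b_lpt, b_avg);
		pwr_now += l_power * MIN(l_lpt, l_avg);
		pwr_now /= SCALE;

		tmp = b_lpt * SCALE / b_power;			/* load we'd subtract */
		if (b_avg > tmp)
			pwr_move += b_power * MIN(b_lpt, b_avg - tmp);

		if (b_avg * b_power < b_lpt * SCALE)		/* load we'd add */
			tmp = b_avg * b_power / l_power;
		else
			tmp = b_lpt * SCALE / l_power;
		pwr_move += l_power * MIN(l_lpt, l_avg + tmp);
		pwr_move /= SCALE;

		/* prints pwr_now=1536 pwr_move=1536: no throughput gain, no move */
		printf("pwr_now=%lu pwr_move=%lu\n", pwr_now, pwr_move);
		return 0;
	}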
4789 5778
4790/** 5779/**
@@ -4796,11 +5785,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4796static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 5785static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4797{ 5786{
4798 unsigned long max_pull, load_above_capacity = ~0UL; 5787 unsigned long max_pull, load_above_capacity = ~0UL;
5788 struct sg_lb_stats *local, *busiest;
5789
5790 local = &sds->local_stat;
5791 busiest = &sds->busiest_stat;
4799 5792
4800 sds->busiest_load_per_task /= sds->busiest_nr_running; 5793 if (busiest->group_imb) {
4801 if (sds->group_imb) { 5794 /*
4802 sds->busiest_load_per_task = 5795 * In the group_imb case we cannot rely on group-wide averages
4803 min(sds->busiest_load_per_task, sds->avg_load); 5796 * to ensure cpu-load equilibrium, look at wider averages. XXX
5797 */
5798 busiest->load_per_task =
5799 min(busiest->load_per_task, sds->avg_load);
4804 } 5800 }
4805 5801
4806 /* 5802 /*
@@ -4808,21 +5804,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4808 * max load less than avg load(as we skip the groups at or below 5804 * max load less than avg load(as we skip the groups at or below
4809 * its cpu_power, while calculating max_load..) 5805 * its cpu_power, while calculating max_load..)
4810 */ 5806 */
4811 if (sds->max_load < sds->avg_load) { 5807 if (busiest->avg_load <= sds->avg_load ||
5808 local->avg_load >= sds->avg_load) {
4812 env->imbalance = 0; 5809 env->imbalance = 0;
4813 return fix_small_imbalance(env, sds); 5810 return fix_small_imbalance(env, sds);
4814 } 5811 }
4815 5812
4816 if (!sds->group_imb) { 5813 if (!busiest->group_imb) {
4817 /* 5814 /*
4818 * Don't want to pull so many tasks that a group would go idle. 5815 * Don't want to pull so many tasks that a group would go idle.
5816 * Except of course for the group_imb case, since then we might
5817 * have to drop below capacity to reach cpu-load equilibrium.
4819 */ 5818 */
4820 load_above_capacity = (sds->busiest_nr_running - 5819 load_above_capacity =
4821 sds->busiest_group_capacity); 5820 (busiest->sum_nr_running - busiest->group_capacity);
4822 5821
4823 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 5822 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4824 5823 load_above_capacity /= busiest->group_power;
4825 load_above_capacity /= sds->busiest->sgp->power;
4826 } 5824 }
4827 5825
4828 /* 5826 /*
@@ -4832,15 +5830,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4832 * we also don't want to reduce the group load below the group capacity 5830 * we also don't want to reduce the group load below the group capacity
4833 * (so that we can implement power-savings policies etc). Thus we look 5831 * (so that we can implement power-savings policies etc). Thus we look
4834 * for the minimum possible imbalance. 5832 * for the minimum possible imbalance.
4835 * Be careful of negative numbers as they'll appear as very large values
4836 * with unsigned longs.
4837 */ 5833 */
4838 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 5834 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4839 5835
4840 /* How much load to actually move to equalise the imbalance */ 5836 /* How much load to actually move to equalise the imbalance */
4841 env->imbalance = min(max_pull * sds->busiest->sgp->power, 5837 env->imbalance = min(
4842 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 5838 max_pull * busiest->group_power,
4843 / SCHED_POWER_SCALE; 5839 (sds->avg_load - local->avg_load) * local->group_power
5840 ) / SCHED_POWER_SCALE;
4844 5841
4845 /* 5842 /*
4846 * if *imbalance is less than the average load per runnable task 5843 * if *imbalance is less than the average load per runnable task
@@ -4848,9 +5845,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4848 * a think about bumping its value to force at least one task to be 5845 * a think about bumping its value to force at least one task to be
4849 * moved 5846 * moved
4850 */ 5847 */
4851 if (env->imbalance < sds->busiest_load_per_task) 5848 if (env->imbalance < busiest->load_per_task)
4852 return fix_small_imbalance(env, sds); 5849 return fix_small_imbalance(env, sds);
4853
4854} 5850}
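A worked example of the rewritten calculation (invented numbers; both SCHED_LOAD_SCALE and SCHED_POWER_SCALE assumed to be 1024): suppose sds.avg_load = 1024, busiest->avg_load = 1536, local->avg_load = 512, both group powers are 1024, and the busiest group runs 3 tasks against a capacity of 1. Then

	load_above_capacity = (3 - 1) * 1024 * 1024 / 1024                = 2048
	max_pull            = min(1536 - 1024, 2048)                      = 512
	env->imbalance      = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512

so at most half a CPU's worth of load is pulled, moving both sides toward the domain average rather than overshooting it.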
4855 5851
4856/******* find_busiest_group() helpers end here *********************/ 5852/******* find_busiest_group() helpers end here *********************/
@@ -4866,69 +5862,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4866 * to restore balance. 5862 * to restore balance.
4867 * 5863 *
4868 * @env: The load balancing environment. 5864 * @env: The load balancing environment.
4869 * @balance: Pointer to a variable indicating if this_cpu
4870 * is the appropriate cpu to perform load balancing at this_level.
4871 * 5865 *
4872 * Returns: - the busiest group if imbalance exists. 5866 * Return: - The busiest group if imbalance exists.
4873 * - If no imbalance and user has opted for power-savings balance, 5867 * - If no imbalance and user has opted for power-savings balance,
4874 * return the least loaded group whose CPUs can be 5868 * return the least loaded group whose CPUs can be
4875 * put to idle by rebalancing its tasks onto our group. 5869 * put to idle by rebalancing its tasks onto our group.
4876 */ 5870 */
4877static struct sched_group * 5871static struct sched_group *find_busiest_group(struct lb_env *env)
4878find_busiest_group(struct lb_env *env, int *balance)
4879{ 5872{
5873 struct sg_lb_stats *local, *busiest;
4880 struct sd_lb_stats sds; 5874 struct sd_lb_stats sds;
4881 5875
4882 memset(&sds, 0, sizeof(sds)); 5876 init_sd_lb_stats(&sds);
4883 5877
4884 /* 5878 /*
 4885 * Compute the various statistics relevant for load balancing at 5879 * Compute the various statistics relevant for load balancing at
4886 * this level. 5880 * this level.
4887 */ 5881 */
4888 update_sd_lb_stats(env, balance, &sds); 5882 update_sd_lb_stats(env, &sds);
4889 5883 local = &sds.local_stat;
4890 /* 5884 busiest = &sds.busiest_stat;
4891 * this_cpu is not the appropriate cpu to perform load balancing at
4892 * this level.
4893 */
4894 if (!(*balance))
4895 goto ret;
4896 5885
4897 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 5886 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4898 check_asym_packing(env, &sds)) 5887 check_asym_packing(env, &sds))
4899 return sds.busiest; 5888 return sds.busiest;
4900 5889
4901 /* There is no busy sibling group to pull tasks from */ 5890 /* There is no busy sibling group to pull tasks from */
4902 if (!sds.busiest || sds.busiest_nr_running == 0) 5891 if (!sds.busiest || busiest->sum_nr_running == 0)
4903 goto out_balanced; 5892 goto out_balanced;
4904 5893
4905 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 5894 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4906 5895
4907 /* 5896 /*
4908 * If the busiest group is imbalanced the below checks don't 5897 * If the busiest group is imbalanced the below checks don't
4909 * work because they assumes all things are equal, which typically 5898 * work because they assume all things are equal, which typically
4910 * isn't true due to cpus_allowed constraints and the like. 5899 * isn't true due to cpus_allowed constraints and the like.
4911 */ 5900 */
4912 if (sds.group_imb) 5901 if (busiest->group_imb)
4913 goto force_balance; 5902 goto force_balance;
4914 5903
4915 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 5904 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4916 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 5905 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4917 !sds.busiest_has_capacity) 5906 !busiest->group_has_capacity)
4918 goto force_balance; 5907 goto force_balance;
4919 5908
4920 /* 5909 /*
4921 * If the local group is more busy than the selected busiest group 5910 * If the local group is more busy than the selected busiest group
4922 * don't try and pull any tasks. 5911 * don't try and pull any tasks.
4923 */ 5912 */
4924 if (sds.this_load >= sds.max_load) 5913 if (local->avg_load >= busiest->avg_load)
4925 goto out_balanced; 5914 goto out_balanced;
4926 5915
4927 /* 5916 /*
4928 * Don't pull any tasks if this group is already above the domain 5917 * Don't pull any tasks if this group is already above the domain
4929 * average load. 5918 * average load.
4930 */ 5919 */
4931 if (sds.this_load >= sds.avg_load) 5920 if (local->avg_load >= sds.avg_load)
4932 goto out_balanced; 5921 goto out_balanced;
4933 5922
4934 if (env->idle == CPU_IDLE) { 5923 if (env->idle == CPU_IDLE) {
@@ -4938,15 +5927,16 @@ find_busiest_group(struct lb_env *env, int *balance)
4938 * there is no imbalance between this and busiest group 5927 * there is no imbalance between this and busiest group
4939 * wrt to idle cpu's, it is balanced. 5928 * wrt to idle cpu's, it is balanced.
4940 */ 5929 */
4941 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 5930 if ((local->idle_cpus < busiest->idle_cpus) &&
4942 sds.busiest_nr_running <= sds.busiest_group_weight) 5931 busiest->sum_nr_running <= busiest->group_weight)
4943 goto out_balanced; 5932 goto out_balanced;
4944 } else { 5933 } else {
4945 /* 5934 /*
4946 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 5935 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4947 * imbalance_pct to be conservative. 5936 * imbalance_pct to be conservative.
4948 */ 5937 */
4949 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) 5938 if (100 * busiest->avg_load <=
5939 env->sd->imbalance_pct * local->avg_load)
4950 goto out_balanced; 5940 goto out_balanced;
4951 } 5941 }
4952 5942
@@ -4956,7 +5946,6 @@ force_balance:
4956 return sds.busiest; 5946 return sds.busiest;
4957 5947
4958out_balanced: 5948out_balanced:
4959ret:
4960 env->imbalance = 0; 5949 env->imbalance = 0;
4961 return NULL; 5950 return NULL;
4962} 5951}
@@ -4968,22 +5957,43 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4968 struct sched_group *group) 5957 struct sched_group *group)
4969{ 5958{
4970 struct rq *busiest = NULL, *rq; 5959 struct rq *busiest = NULL, *rq;
4971 unsigned long max_load = 0; 5960 unsigned long busiest_load = 0, busiest_power = 1;
4972 int i; 5961 int i;
4973 5962
4974 for_each_cpu(i, sched_group_cpus(group)) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4975 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
4976 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
4977 SCHED_POWER_SCALE);
4978 unsigned long wl;
4979 5966
4980 if (!capacity) 5967 rq = cpu_rq(i);
4981 capacity = fix_small_capacity(env->sd, group); 5968 rt = fbq_classify_rq(rq);
4982 5969
4983 if (!cpumask_test_cpu(i, env->cpus)) 5970 /*
5971 * We classify groups/runqueues into three groups:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
 5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
5983 * If we cannot move enough load due to this classification
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
4984 continue; 5990 continue;
4985 5991
4986 rq = cpu_rq(i); 5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5994 if (!capacity)
5995 capacity = fix_small_capacity(env->sd, group);
5996
4987 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
4988 5998
4989 /* 5999 /*
@@ -4998,11 +6008,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4998 * the weighted_cpuload() scaled with the cpu power, so that 6008 * the weighted_cpuload() scaled with the cpu power, so that
4999 * the load can be moved away from the cpu that is potentially 6009 * the load can be moved away from the cpu that is potentially
5000 * running at a lower capacity. 6010 * running at a lower capacity.
6011 *
6012 * Thus we're looking for max(wl_i / power_i), crosswise
6013 * multiplication to rid ourselves of the division works out
6014 * to: wl_i * power_j > wl_j * power_i; where j is our
6015 * previous maximum.
5001 */ 6016 */
5002 wl = (wl * SCHED_POWER_SCALE) / power; 6017 if (wl * busiest_power > busiest_load * power) {
5003 6018 busiest_load = wl;
5004 if (wl > max_load) { 6019 busiest_power = power;
5005 max_load = wl;
5006 busiest = rq; 6020 busiest = rq;
5007 } 6021 }
5008 } 6022 }
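A quick check of the cross-multiplied comparison with invented figures: a candidate CPU with wl = 2048 and power = 2048 (ratio 1.0) against a current maximum with wl = 1536 and power = 1024 (ratio 1.5) gives 2048 * 1024 = 2,097,152 versus 1536 * 2048 = 3,145,728, so the candidate is correctly rejected without performing any division, matching 1.0 < 1.5.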
@@ -5039,15 +6053,50 @@ static int need_active_balance(struct lb_env *env)
5039 6053
5040static int active_load_balance_cpu_stop(void *data); 6054static int active_load_balance_cpu_stop(void *data);
5041 6055
6056static int should_we_balance(struct lb_env *env)
6057{
6058 struct sched_group *sg = env->sd->groups;
6059 struct cpumask *sg_cpus, *sg_mask;
6060 int cpu, balance_cpu = -1;
6061
6062 /*
6063 * In the newly idle case, we will allow all the cpu's
6064 * to do the newly idle load balance.
6065 */
6066 if (env->idle == CPU_NEWLY_IDLE)
6067 return 1;
6068
6069 sg_cpus = sched_group_cpus(sg);
6070 sg_mask = sched_group_mask(sg);
6071 /* Try to find first idle cpu */
6072 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6073 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6074 continue;
6075
6076 balance_cpu = cpu;
6077 break;
6078 }
6079
6080 if (balance_cpu == -1)
6081 balance_cpu = group_balance_cpu(sg);
6082
6083 /*
6084 * First idle cpu or the first cpu(busiest) in this sched group
6085 * is eligible for doing load balancing at this and above domains.
6086 */
6087 return balance_cpu == env->dst_cpu;
6088}
6089
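A concrete illustration of the election above (hypothetical topology): take a sched group spanning CPUs 0-3, all present in env->cpus and in the group's balance mask, with CPUs 1 and 2 idle. CPU 1 is the first idle CPU found, so should_we_balance() returns true only when env->dst_cpu == 1; every other CPU bails out of load_balance() early and reports *continue_balancing = 0 to its caller. In the CPU_NEWLY_IDLE case the election is skipped entirely and every newly idle CPU is allowed to balance.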
5042/* 6090/*
5043 * Check this_cpu to ensure it is balanced within domain. Attempt to move 6091 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5044 * tasks if there is an imbalance. 6092 * tasks if there is an imbalance.
5045 */ 6093 */
5046static int load_balance(int this_cpu, struct rq *this_rq, 6094static int load_balance(int this_cpu, struct rq *this_rq,
5047 struct sched_domain *sd, enum cpu_idle_type idle, 6095 struct sched_domain *sd, enum cpu_idle_type idle,
5048 int *balance) 6096 int *continue_balancing)
5049{ 6097{
5050 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5051 struct sched_group *group; 6100 struct sched_group *group;
5052 struct rq *busiest; 6101 struct rq *busiest;
5053 unsigned long flags; 6102 unsigned long flags;
@@ -5061,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5061 .idle = idle, 6110 .idle = idle,
5062 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5063 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5064 }; 6114 };
5065 6115
5066 /* 6116 /*
@@ -5075,11 +6125,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5075 schedstat_inc(sd, lb_count[idle]); 6125 schedstat_inc(sd, lb_count[idle]);
5076 6126
5077redo: 6127redo:
5078 group = find_busiest_group(&env, balance); 6128 if (!should_we_balance(&env)) {
5079 6129 *continue_balancing = 0;
5080 if (*balance == 0)
5081 goto out_balanced; 6130 goto out_balanced;
6131 }
5082 6132
6133 group = find_busiest_group(&env);
5083 if (!group) { 6134 if (!group) {
5084 schedstat_inc(sd, lb_nobusyg[idle]); 6135 schedstat_inc(sd, lb_nobusyg[idle]);
5085 goto out_balanced; 6136 goto out_balanced;
@@ -5108,7 +6159,6 @@ redo:
5108 env.src_rq = busiest; 6159 env.src_rq = busiest;
5109 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6160 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5110 6161
5111 update_h_load(env.src_cpu);
5112more_balance: 6162more_balance:
5113 local_irq_save(flags); 6163 local_irq_save(flags);
5114 double_rq_lock(env.dst_rq, busiest); 6164 double_rq_lock(env.dst_rq, busiest);
@@ -5152,17 +6202,17 @@ more_balance:
5152 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5153 * excess load moved. 6203 * excess load moved.
5154 */ 6204 */
5155 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
6207 /* Prevent to re-select dst_cpu via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5156 6209
5157 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5158 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5159 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5160 env.loop = 0; 6213 env.loop = 0;
5161 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5162 6215
5163 /* Prevent to re-select dst_cpu via env's cpus */
5164 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5165
5166 /* 6216 /*
5167 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5168 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
@@ -5170,6 +6220,18 @@ more_balance:
5170 goto more_balance; 6220 goto more_balance;
5171 } 6221 }
5172 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5173 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5174 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5175 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5277,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5277 struct sched_domain *sd; 6339 struct sched_domain *sd;
5278 int pulled_task = 0; 6340 int pulled_task = 0;
5279 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5280 6343
5281 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5282 6345
@@ -5292,15 +6355,28 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5292 rcu_read_lock(); 6355 rcu_read_lock();
5293 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5294 unsigned long interval; 6357 unsigned long interval;
5295 int balance = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5296 6360
5297 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5298 continue; 6362 continue;
5299 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5300 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5301 /* If we've pulled tasks over stop searching: */ 6370 /* If we've pulled tasks over stop searching: */
5302 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5303 sd, CPU_NEWLY_IDLE, &balance); 6372 sd, CPU_NEWLY_IDLE,
6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5304 } 6380 }
5305 6381
5306 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5322,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5322 */ 6398 */
5323 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5324 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5325} 6404}
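For instance (invented figures): with this_rq->avg_idle at 500,000 ns, a first domain whose max_newidle_lb_cost is 120,000 ns is still attempted; if that pass ends up costing 110,000 ns, curr_cost becomes 110,000 and a second domain with max_newidle_lb_cost of 450,000 ns is skipped, since 110,000 + 450,000 exceeds the 500,000 ns the CPU is expected to stay idle. The measured costs also ratchet each domain's max_newidle_lb_cost and, at the end, rq->max_idle_balance_cost upward.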
5326 6405
5327/* 6406/*
@@ -5455,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5455static inline void set_cpu_sd_state_busy(void) 6534static inline void set_cpu_sd_state_busy(void)
5456{ 6535{
5457 struct sched_domain *sd; 6536 struct sched_domain *sd;
6537 int cpu = smp_processor_id();
5458 6538
5459 rcu_read_lock(); 6539 rcu_read_lock();
5460 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6540 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5461 6541
5462 if (!sd || !sd->nohz_idle) 6542 if (!sd || !sd->nohz_idle)
5463 goto unlock; 6543 goto unlock;
5464 sd->nohz_idle = 0; 6544 sd->nohz_idle = 0;
5465 6545
5466 for (; sd; sd = sd->parent) 6546 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5467 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5468unlock: 6547unlock:
5469 rcu_read_unlock(); 6548 rcu_read_unlock();
5470} 6549}
@@ -5472,16 +6551,16 @@ unlock:
5472void set_cpu_sd_state_idle(void) 6551void set_cpu_sd_state_idle(void)
5473{ 6552{
5474 struct sched_domain *sd; 6553 struct sched_domain *sd;
6554 int cpu = smp_processor_id();
5475 6555
5476 rcu_read_lock(); 6556 rcu_read_lock();
5477 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6557 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5478 6558
5479 if (!sd || sd->nohz_idle) 6559 if (!sd || sd->nohz_idle)
5480 goto unlock; 6560 goto unlock;
5481 sd->nohz_idle = 1; 6561 sd->nohz_idle = 1;
5482 6562
5483 for (; sd; sd = sd->parent) 6563 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5484 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5485unlock: 6564unlock:
5486 rcu_read_unlock(); 6565 rcu_read_unlock();
5487} 6566}
@@ -5538,22 +6617,46 @@ void update_max_interval(void)
5538 */ 6617 */
5539static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6618static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5540{ 6619{
5541 int balance = 1; 6620 int continue_balancing = 1;
5542 struct rq *rq = cpu_rq(cpu); 6621 struct rq *rq = cpu_rq(cpu);
5543 unsigned long interval; 6622 unsigned long interval;
5544 struct sched_domain *sd; 6623 struct sched_domain *sd;
5545 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5546 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5547 int update_next_balance = 0; 6626 int update_next_balance = 0;
5548 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5549 6629
5550 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5551 6631
5552 rcu_read_lock(); 6632 rcu_read_lock();
5553 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5554 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5555 continue; 6647 continue;
5556 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5557 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5558 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5559 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
@@ -5570,9 +6673,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5570 } 6673 }
5571 6674
5572 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5573 if (load_balance(cpu, rq, sd, idle, &balance)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5574 /* 6677 /*
5575 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5576 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5577 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5578 */ 6681 */
@@ -5587,14 +6690,14 @@ out:
5587 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5588 update_next_balance = 1; 6691 update_next_balance = 1;
5589 } 6692 }
5590 6693 }
6694 if (need_decay) {
5591 /* 6695 /*
5592 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5593 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5594 * actively.
5595 */ 6698 */
5596 if (!balance) 6699 rq->max_idle_balance_cost =
5597 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5598 } 6701 }
5599 rcu_read_unlock(); 6702 rcu_read_unlock();
5600 6703
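The 253/256 factor works out to a decay of about 1.17% per step, applied at most once per second, which is what the "~1% per second" remark above means in practice. A tiny standalone check (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long cost = 1000000;	/* ns, e.g. a 1 ms newidle pass */
		int sec;

		for (sec = 0; sec < 50; sec++)
			cost = cost * 253 / 256;	/* one decay step per second */

		/* roughly 55% of the original cost remains after 50 seconds */
		printf("%llu\n", cost);
		return 0;
	}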
@@ -5664,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5664{ 6767{
5665 unsigned long now = jiffies; 6768 unsigned long now = jiffies;
5666 struct sched_domain *sd; 6769 struct sched_domain *sd;
6770 struct sched_group_power *sgp;
6771 int nr_busy;
5667 6772
5668 if (unlikely(idle_cpu(cpu))) 6773 if (unlikely(idle_cpu(cpu)))
5669 return 0; 6774 return 0;
@@ -5689,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5689 goto need_kick; 6794 goto need_kick;
5690 6795
5691 rcu_read_lock(); 6796 rcu_read_lock();
5692 for_each_domain(cpu, sd) { 6797 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5693 struct sched_group *sg = sd->groups;
5694 struct sched_group_power *sgp = sg->sgp;
5695 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5696 6798
5697 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6799 if (sd) {
5698 goto need_kick_unlock; 6800 sgp = sd->groups->sgp;
6801 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5699 6802
5700 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6803 if (nr_busy > 1)
5701 && (cpumask_first_and(nohz.idle_cpus_mask,
5702 sched_domain_span(sd)) < cpu))
5703 goto need_kick_unlock; 6804 goto need_kick_unlock;
5704
5705 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5706 break;
5707 } 6805 }
6806
6807 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6808
6809 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6810 sched_domain_span(sd)) < cpu))
6811 goto need_kick_unlock;
6812
5708 rcu_read_unlock(); 6813 rcu_read_unlock();
5709 return 0; 6814 return 0;
5710 6815
@@ -5812,11 +6917,15 @@ static void task_fork_fair(struct task_struct *p)
5812 cfs_rq = task_cfs_rq(current); 6917 cfs_rq = task_cfs_rq(current);
5813 curr = cfs_rq->curr; 6918 curr = cfs_rq->curr;
5814 6919
5815 if (unlikely(task_cpu(p) != this_cpu)) { 6920 /*
5816 rcu_read_lock(); 6921 * Not only the cpu but also the task_group of the parent might have
5817 __set_task_cpu(p, this_cpu); 6922 * been changed after parent->se.parent,cfs_rq were copied to
5818 rcu_read_unlock(); 6923 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
5819 } 6924 * of child point to valid ones.
6925 */
6926 rcu_read_lock();
6927 __set_task_cpu(p, this_cpu);
6928 rcu_read_unlock();
5820 6929
5821 update_curr(cfs_rq); 6930 update_curr(cfs_rq);
5822 6931
@@ -5889,11 +6998,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5889 * and ensure we don't carry in an old decay_count if we 6998 * and ensure we don't carry in an old decay_count if we
5890 * switch back. 6999 * switch back.
5891 */ 7000 */
5892 if (p->se.avg.decay_count) { 7001 if (se->avg.decay_count) {
5893 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 7002 __synchronize_entity_decay(se);
5894 __synchronize_entity_decay(&p->se); 7003 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
5895 subtract_blocked_load_contrib(cfs_rq,
5896 p->se.avg.load_avg_contrib);
5897 } 7004 }
5898#endif 7005#endif
5899} 7006}
@@ -6095,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6095 se->cfs_rq = parent->my_q; 7202 se->cfs_rq = parent->my_q;
6096 7203
6097 se->my_q = cfs_rq; 7204 se->my_q = cfs_rq;
6098 update_load_set(&se->load, 0); 7205 /* guarantee group entities always have weight */
7206 update_load_set(&se->load, NICE_0_LOAD);
6099 se->parent = parent; 7207 se->parent = parent;
6100} 7208}
6101 7209
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
1632 * Match the barrier from rt_set_overloaded; this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
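Putting the two new comments side by side, the pairing they describe looks like this (illustrative sketch condensed from the surrounding code, not a verbatim quote):

	writer: rt_set_overload()           reader: pull_rt_task()
	  cpumask_set_cpu(cpu, rto_mask)      if (!rt_overloaded(this_rq))   /* reads rto_count */
	  smp_wmb()                                   return 0;
	  atomic_inc(&rto_count)              smp_rmb();
	                                      for_each_cpu(cpu, rto_mask) ...

If the reader observes the incremented rto_count, the smp_rmb()/smp_wmb() pair guarantees it also observes the rto_mask bit that was set before the increment.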
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -285,7 +286,6 @@ struct cfs_rq {
285 /* Required to track per-cpu representation of a task_group */ 286 /* Required to track per-cpu representation of a task_group */
286 u32 tg_runnable_contrib; 287 u32 tg_runnable_contrib;
287 unsigned long tg_load_contrib; 288 unsigned long tg_load_contrib;
288#endif /* CONFIG_FAIR_GROUP_SCHED */
289 289
290 /* 290 /*
291 * h_load = weight * f(tg) 291 * h_load = weight * f(tg)
@@ -294,6 +294,9 @@ struct cfs_rq {
294 * this group. 294 * this group.
295 */ 295 */
296 unsigned long h_load; 296 unsigned long h_load;
297 u64 last_h_load_update;
298 struct sched_entity *h_load_next;
299#endif /* CONFIG_FAIR_GROUP_SCHED */
297#endif /* CONFIG_SMP */ 300#endif /* CONFIG_SMP */
298 301
299#ifdef CONFIG_FAIR_GROUP_SCHED 302#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -406,6 +409,10 @@ struct rq {
406 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
407 */ 410 */
408 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
409 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
410 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
411 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -429,9 +436,6 @@ struct rq {
429#ifdef CONFIG_FAIR_GROUP_SCHED 436#ifdef CONFIG_FAIR_GROUP_SCHED
430 /* list of leaf cfs_rq on this cpu: */ 437 /* list of leaf cfs_rq on this cpu: */
431 struct list_head leaf_cfs_rq_list; 438 struct list_head leaf_cfs_rq_list;
432#ifdef CONFIG_SMP
433 unsigned long h_load_throttle;
434#endif /* CONFIG_SMP */
435#endif /* CONFIG_FAIR_GROUP_SCHED */ 439#endif /* CONFIG_FAIR_GROUP_SCHED */
436 440
437#ifdef CONFIG_RT_GROUP_SCHED 441#ifdef CONFIG_RT_GROUP_SCHED
@@ -477,6 +481,9 @@ struct rq {
477 u64 age_stamp; 481 u64 age_stamp;
478 u64 idle_stamp; 482 u64 idle_stamp;
479 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
480#endif 487#endif
481 488
482#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -553,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
553 return rq->clock_task; 560 return rq->clock_task;
554} 561}
555 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
556#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
557 570
558#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -594,8 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
594 return hsd; 607 return hsd;
595} 608}
596 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
597DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -665,9 +695,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 695/*
666 * Return the group to which this tasks belongs. 696 * Return the group to which this tasks belongs.
667 * 697 *
668 * We cannot use task_subsys_state() and friends because the cgroup 698 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 699 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 700 * therefore we cannot pin it and might observe the wrong value.
671 * 701 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 702 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 703 * core changes this before calling sched_move_task().
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
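Ordering the two locks by pointer address gives an arbitrary but globally consistent acquisition order, which is what makes these helpers deadlock-safe: for example, if one CPU calls double_lock(a, b) while another calls double_lock(b, a), both take the lower-addressed lock first, so neither can hold one of the pair while waiting for the other in the opposite order. The *_nested(SINGLE_DEPTH_NESTING) call on the second lock tells lockdep that this nested acquisition of a same-class lock is intentional.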
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494fc8b4..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
107 * Called when a process ceases being the active-running process, either 107 * Called when a process ceases being the active-running process involuntarily
108 * voluntarily or involuntarily. Now we can calculate how long we ran. 108 * due, typically, to expiring its time slice (this may also be called when
109 * switching to the idle task). Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call 110 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue. 112 * the runqueue.
112 */ 113 */
113static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
114{ 115{
115 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
116 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
117 118
118 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
119 120
120 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
121 sched_info_queued(t); 122 sched_info_queued(rq, t);
122} 123}
123 124
124/* 125/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
127 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
128 */ 129 */
129static inline void 130static inline void
130__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
131{ 133{
132 struct rq *rq = task_rq(prev);
133
134 /* 134 /*
135 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
136 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
137 * process, however. 137 * process, however.
138 */ 138 */
139 if (prev != rq->idle) 139 if (prev != rq->idle)
140 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
141 141
142 if (next != rq->idle) 142 if (next != rq->idle)
143 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
144} 144}
145static inline void 145static inline void
146sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
147{ 148{
148 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
149 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
150} 151}
151#else 152#else
152#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157 160
158/* 161/*
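
The stats.h changes above thread the runqueue through sched_info_{queued,dequeued,arrive,depart}() instead of re-deriving it with task_rq(), but the accounting itself is unchanged: stamp the moment a task becomes runnable, then on arrival charge now - last_queued to its cumulative run_delay. The following is a rough userspace restatement of that pattern, with an ordinary monotonic clock standing in for rq_clock(); struct and function names are illustrative only.

#include <stdio.h>
#include <time.h>

struct sched_info {
        unsigned long long last_queued; /* 0 means "not stamped" */
        unsigned long long run_delay;   /* total time spent runnable but waiting */
        unsigned long pcount;           /* number of times the task got a CPU */
};

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Stamp the moment the task becomes runnable, if not already stamped. */
static void info_queued(struct sched_info *si)
{
        if (!si->last_queued)
                si->last_queued = now_ns();
}

/* The task starts running: account how long it waited on the queue. */
static void info_arrive(struct sched_info *si)
{
        if (si->last_queued) {
                si->run_delay += now_ns() - si->last_queued;
                si->last_queued = 0;
        }
        si->pcount++;
}

int main(void)
{
        struct sched_info si = { 0 };

        info_queued(&si);
        /* ... task sits on the runqueue for a while ... */
        info_arrive(&si);
        printf("waited %llu ns over %lu runs\n", si.run_delay, si.pcount);
        return 0;
}
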
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
 16 return task_cpu(p); /* stop tasks as they never migrate */ 16 return task_cpu(p); /* stop tasks as they never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index dec68bd4e9d8..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
 121 * The sync wakeup differs in that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
@@ -363,8 +490,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
363 490
364/** 491/**
365 * wake_up_atomic_t - Wake up a waiter on a atomic_t 492 * wake_up_atomic_t - Wake up a waiter on a atomic_t
366 * @word: The word being waited on, a kernel virtual address 493 * @p: The atomic_t being waited on, a kernel virtual address
367 * @bit: The bit of the word being waited on
368 * 494 *
369 * Wake up anyone waiting for the atomic_t to go to zero. 495 * Wake up anyone waiting for the atomic_t to go to zero.
370 * 496 *
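
The comment block moved in with __wake_up_common() describes the wakeup policy: non-exclusive waiters are always woken, and exclusive waiters are woken until nr_exclusive successful wakeups have occurred (nr_exclusive == 0 wakes everyone, and a waiter whose wake callback reports failure does not consume the budget). The sketch below restates just that scan over a plain singly linked list in userspace; the queue lock, the key argument and safe traversal are omitted.

#include <stdbool.h>
#include <stdio.h>

#define WQ_EXCLUSIVE 0x01

struct waiter {
        unsigned int flags;
        bool (*wake)(struct waiter *w);  /* returns false if already running */
        struct waiter *next;
};

/*
 * Every non-exclusive waiter is woken; exclusive waiters are woken until
 * nr_exclusive successful wakeups have happened (0 wakes everybody).
 */
static void wake_up_common(struct waiter *head, int nr_exclusive)
{
        for (struct waiter *w = head; w; w = w->next) {
                if (w->wake(w) && (w->flags & WQ_EXCLUSIVE) && !--nr_exclusive)
                        break;
        }
}

static bool wake_fn(struct waiter *w)
{
        printf("woke %p (flags=%u)\n", (void *)w, w->flags);
        return true;
}

int main(void)
{
        struct waiter c = { WQ_EXCLUSIVE, wake_fn, NULL };
        struct waiter b = { WQ_EXCLUSIVE, wake_fn, &c };
        struct waiter a = { 0,            wake_fn, &b };

        wake_up_common(&a, 1);  /* wakes a (non-exclusive) and b (one exclusive) */
        return 0;
}
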
diff --git a/kernel/signal.c b/kernel/signal.c
index 50e41075ac77..ded28b91fa53 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3394 new_ka.sa.sa_restorer = compat_ptr(restorer); 3394 new_ka.sa.sa_restorer = compat_ptr(restorer);
3395#endif 3395#endif
3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); 3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3397 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 3397 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
3398 if (ret) 3398 if (ret)
3399 return -EFAULT; 3399 return -EFAULT;
3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask); 3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3407 &oact->sa_handler); 3407 &oact->sa_handler);
3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); 3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3409 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 3409 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3410#ifdef __ARCH_HAS_SA_RESTORER 3410#ifdef __ARCH_HAS_SA_RESTORER
3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), 3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3412 &oact->sa_restorer); 3412 &oact->sa_restorer);
diff --git a/kernel/smp.c b/kernel/smp.c
index fe9f773d7114..f5768b0c816a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) 51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
52 return notifier_from_errno(-ENOMEM); 53 return notifier_from_errno(-ENOMEM);
54 }
53 cfd->csd = alloc_percpu(struct call_single_data); 55 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) { 56 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
55 free_cpumask_var(cfd->cpumask); 58 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM); 59 return notifier_from_errno(-ENOMEM);
57 } 60 }
@@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void)
186 189
187 while (!list_empty(&list)) { 190 while (!list_empty(&list)) {
188 struct call_single_data *csd; 191 struct call_single_data *csd;
189 unsigned int csd_flags;
190 192
191 csd = list_entry(list.next, struct call_single_data, list); 193 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&csd->list); 194 list_del(&csd->list);
193 195
194 /*
195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()),
197 * so save them away before making the call:
198 */
199 csd_flags = csd->flags;
200
201 csd->func(csd->info); 196 csd->func(csd->info);
202 197
203 /* 198 csd_unlock(csd);
204 * Unlocked CSDs are valid through generic_exec_single():
205 */
206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(csd);
208 } 199 }
209} 200}
210 201
@@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single);
278 * @wait: If true, wait until function has completed. 269 * @wait: If true, wait until function has completed.
279 * 270 *
280 * Returns 0 on success, else a negative status code (if no cpus were online). 271 * Returns 0 on success, else a negative status code (if no cpus were online).
281 * Note that @wait will be implicitly turned on in case of allocation failures,
282 * since we fall back to on-stack allocation.
283 * 272 *
284 * Selection preference: 273 * Selection preference:
285 * 1) current cpu if in @mask 274 * 1) current cpu if in @mask
@@ -535,6 +524,11 @@ void __init setup_nr_cpu_ids(void)
535 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
536} 525}
537 526
527void __weak smp_announce(void)
528{
529 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
530}
531
538/* Called by boot processor to activate the rest. */ 532/* Called by boot processor to activate the rest. */
539void __init smp_init(void) 533void __init smp_init(void)
540{ 534{
@@ -551,7 +545,7 @@ void __init smp_init(void)
551 } 545 }
552 546
553 /* Any cleanup work */ 547 /* Any cleanup work */
554 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 548 smp_announce();
555 smp_cpus_done(setup_max_cpus); 549 smp_cpus_done(setup_max_cpus);
556} 550}
557 551
@@ -586,8 +580,10 @@ EXPORT_SYMBOL(on_each_cpu);
586 * 580 *
587 * If @wait is true, then returns once @func has returned. 581 * If @wait is true, then returns once @func has returned.
588 * 582 *
589 * You must not call this function with disabled interrupts or 583 * You must not call this function with disabled interrupts or from a
590 * from a hardware interrupt handler or from a bottom half handler. 584 * hardware interrupt handler or from a bottom half handler. The
585 * exception is that it may be used during early boot while
586 * early_boot_irqs_disabled is set.
591 */ 587 */
592void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, 588void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
593 void *info, bool wait) 589 void *info, bool wait)
@@ -596,9 +592,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
596 592
597 smp_call_function_many(mask, func, info, wait); 593 smp_call_function_many(mask, func, info, wait);
598 if (cpumask_test_cpu(cpu, mask)) { 594 if (cpumask_test_cpu(cpu, mask)) {
599 local_irq_disable(); 595 unsigned long flags;
596 local_irq_save(flags);
600 func(info); 597 func(info);
601 local_irq_enable(); 598 local_irq_restore(flags);
602 } 599 }
603 put_cpu(); 600 put_cpu();
604} 601}
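
The on_each_cpu_mask() hunk swaps local_irq_disable()/local_irq_enable() for local_irq_save()/local_irq_restore(), so the function no longer unconditionally re-enables interrupts; that matters when the caller already runs with interrupts off, e.g. during early boot as the updated comment notes. The toy model below shows the difference with a plain boolean standing in for the CPU interrupt-enable flag; these helpers are stand-ins, not the kernel primitives.

#include <assert.h>
#include <stdbool.h>

static bool irqs_on = true;     /* stand-in for the CPU interrupt-enable flag */

static void irq_disable(void)     { irqs_on = false; }
static void irq_enable(void)      { irqs_on = true; }
static bool irq_save(void)        { bool was = irqs_on; irqs_on = false; return was; }
static void irq_restore(bool was) { irqs_on = was; }

static void callback(void) { /* func(info) would run here */ }

int main(void)
{
        /* The caller already has interrupts off (the early-boot case). */
        irq_disable();

        /* save/restore leaves the caller's state intact ... */
        bool flags = irq_save();
        callback();
        irq_restore(flags);
        assert(!irqs_on);

        /* ... while a blind disable/enable pair would re-enable them. */
        irq_disable();
        callback();
        irq_enable();
        assert(irqs_on);        /* the caller's "off" state has been lost */

        return 0;
}
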
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d3514c325..b24988353458 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -29,7 +29,6 @@
29#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 30#include <trace/events/irq.h>
31 31
32#include <asm/irq.h>
33/* 32/*
34 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 99
101 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
102 /* 101 /*
103 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
108 */ 107 */
109 preempt_count() += cnt; 108 __preempt_count_add(cnt);
110 /* 109 /*
111 * Were softirqs turned off above: 110 * Were softirqs turned off above:
112 */ 111 */
@@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 121{
123 add_preempt_count(cnt); 122 preempt_count_add(cnt);
124 barrier(); 123 barrier();
125} 124}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 133
135static void __local_bh_enable(unsigned int cnt) 134static void __local_bh_enable(unsigned int cnt)
136{ 135{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 136 WARN_ON_ONCE(!irqs_disabled());
139 137
140 if (softirq_count() == cnt) 138 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 139 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 140 preempt_count_sub(cnt);
143} 141}
144 142
145/* 143/*
@@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 147 */
150void _local_bh_enable(void) 148void _local_bh_enable(void)
151{ 149{
150 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 151 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 152}
154 153
@@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 168 * Keep preemption disabled until we are done with
170 * softirq processing: 169 * softirq processing:
171 */ 170 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 171 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 172
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 173 if (unlikely(!in_interrupt() && local_softirq_pending())) {
174 /*
175 * Run softirq if any pending. And do it in its own stack
176 * as we may be calling this deep in a task call stack already.
177 */
175 do_softirq(); 178 do_softirq();
179 }
176 180
177 dec_preempt_count(); 181 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 182#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 183 local_irq_enable();
180#endif 184#endif
@@ -256,7 +260,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 260 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 261 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 262 prev_count, preempt_count());
259 preempt_count() = prev_count; 263 preempt_count_set(prev_count);
260 } 264 }
261 265
262 rcu_bh_qs(cpu); 266 rcu_bh_qs(cpu);
@@ -280,10 +284,11 @@ restart:
280 284
281 account_irq_exit_time(current); 285 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 286 __local_bh_enable(SOFTIRQ_OFFSET);
287 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 288 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 289}
285 290
286#ifndef __ARCH_HAS_DO_SOFTIRQ 291
287 292
288asmlinkage void do_softirq(void) 293asmlinkage void do_softirq(void)
289{ 294{
@@ -298,13 +303,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 303 pending = local_softirq_pending();
299 304
300 if (pending) 305 if (pending)
301 __do_softirq(); 306 do_softirq_own_stack();
302 307
303 local_irq_restore(flags); 308 local_irq_restore(flags);
304} 309}
305 310
306#endif
307
308/* 311/*
309 * Enter an interrupt context. 312 * Enter an interrupt context.
310 */ 313 */
@@ -328,10 +331,25 @@ void irq_enter(void)
328 331
329static inline void invoke_softirq(void) 332static inline void invoke_softirq(void)
330{ 333{
331 if (!force_irqthreads) 334 if (!force_irqthreads) {
335#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
336 /*
337 * We can safely execute softirq on the current stack if
338 * it is the irq stack, because it should be near empty
339 * at this stage.
340 */
332 __do_softirq(); 341 __do_softirq();
333 else 342#else
343 /*
344 * Otherwise, irq_exit() is called on the task stack that can
345 * be potentially deep already. So call softirq in its own stack
346 * to prevent from any overrun.
347 */
348 do_softirq_own_stack();
349#endif
350 } else {
334 wakeup_softirqd(); 351 wakeup_softirqd();
352 }
335} 353}
336 354
337static inline void tick_irq_exit(void) 355static inline void tick_irq_exit(void)
@@ -360,7 +378,7 @@ void irq_exit(void)
360 378
361 account_irq_exit_time(current); 379 account_irq_exit_time(current);
362 trace_hardirq_exit(); 380 trace_hardirq_exit();
363 sub_preempt_count(HARDIRQ_OFFSET); 381 preempt_count_sub(HARDIRQ_OFFSET);
364 if (!in_interrupt() && local_softirq_pending()) 382 if (!in_interrupt() && local_softirq_pending())
365 invoke_softirq(); 383 invoke_softirq();
366 384
@@ -762,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu)
762{ 780{
763 local_irq_disable(); 781 local_irq_disable();
764 if (local_softirq_pending()) { 782 if (local_softirq_pending()) {
783 /*
784 * We can safely run softirq on inline stack, as we are not deep
785 * in the task stack here.
786 */
765 __do_softirq(); 787 __do_softirq();
766 rcu_note_context_switch(cpu); 788 rcu_note_context_switch(cpu);
767 local_irq_enable(); 789 local_irq_enable();
@@ -876,7 +898,6 @@ int __init __weak early_irq_init(void)
876 return 0; 898 return 0;
877} 899}
878 900
879#ifdef CONFIG_GENERIC_HARDIRQS
880int __init __weak arch_probe_nr_irqs(void) 901int __init __weak arch_probe_nr_irqs(void)
881{ 902{
882 return NR_IRQS_LEGACY; 903 return NR_IRQS_LEGACY;
@@ -886,4 +907,3 @@ int __init __weak arch_early_irq_init(void)
886{ 907{
887 return 0; 908 return 0;
888} 909}
889#endif
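
Most of the softirq.c churn above is the rename from add/sub_preempt_count() to preempt_count_add()/_sub() plus the new do_softirq_own_stack() call sites, but the logic being preserved is the nesting count: local_bh_disable() bumps a count, and only the outermost local_bh_enable() gets to run whatever softirqs were raised meanwhile. Below is a loose single-threaded model of that counting scheme, not the kernel implementation (which also handles preemption, tracing and the separate softirq stack).

#include <stdio.h>

#define SOFTIRQ_DISABLE_OFFSET  0x100   /* one level of "softirqs disabled" */

static unsigned int preempt_count;      /* stands in for the kernel's preempt count */
static unsigned int softirq_pending;    /* bitmask of raised softirqs */

static void bh_disable(void)
{
        preempt_count += SOFTIRQ_DISABLE_OFFSET;
}

static void run_pending_softirqs(void)
{
        while (softirq_pending) {
                printf("servicing softirq mask %#x\n", softirq_pending);
                softirq_pending = 0;
        }
}

static void bh_enable(void)
{
        preempt_count -= SOFTIRQ_DISABLE_OFFSET;
        /* Only the outermost enable may process what was raised meanwhile. */
        if (!preempt_count && softirq_pending)
                run_pending_softirqs();
}

int main(void)
{
        bh_disable();
        bh_disable();                   /* nested critical section */
        softirq_pending |= 1u << 3;     /* something got raised, e.g. NET_RX */
        bh_enable();                    /* still nested: nothing runs yet */
        bh_enable();                    /* outermost enable: serviced here */
        return 0;
}
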
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3ce..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -34,6 +34,20 @@
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l) 35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l) 36#define raw_write_can_lock(l) write_can_lock(l)
37
38/*
39 * Some architectures can relax in favour of the CPU owning the lock.
40 */
41#ifndef arch_read_relax
42# define arch_read_relax(l) cpu_relax()
43#endif
44#ifndef arch_write_relax
45# define arch_write_relax(l) cpu_relax()
46#endif
47#ifndef arch_spin_relax
48# define arch_spin_relax(l) cpu_relax()
49#endif
50
37/* 51/*
38 * We build the __lock_function inlines here. They are too large for 52 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function 53 * inlining all over the place, but here is only one user per function
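
The arch_read/write/spin_relax() fallbacks added above simply give the generic lock loops a hook that defaults to cpu_relax(); an architecture can override it to yield to the lock holder. The fragment below shows where such a hook sits in a minimal test-and-set lock built on C11 atomics; relax() here is an empty stand-in, not the kernel macro.

#include <stdatomic.h>

static inline void relax(void)
{
        /* cpu_relax()/arch_spin_relax() would go here (e.g. 'pause' on x86). */
}

struct tiny_lock {
        atomic_flag locked;
};

static void tiny_lock_acquire(struct tiny_lock *l)
{
        /* Spin until the flag is clear, relaxing on every failed attempt. */
        while (atomic_flag_test_and_set_explicit(&l->locked, memory_order_acquire))
                relax();
}

static void tiny_lock_release(struct tiny_lock *l)
{
        atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

int main(void)
{
        struct tiny_lock l = { ATOMIC_FLAG_INIT };

        tiny_lock_acquire(&l);
        tiny_lock_release(&l);
        return 0;
}
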
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
 278 * first. Or it's not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
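
The heart of the stop_machine.c rework is that the old stop_machine-only state machine becomes the generic multi_cpu_stop(): every participating CPU spins on a shared state word, performs the action for the state it just observed, and acknowledges it; the last acknowledger advances the machine (PREPARE -> DISABLE_IRQ -> RUN -> EXIT). Below is a userspace model of that handshake with two pthreads and C11 atomics; the interrupt-disabling step and the per-CPU stopper threads are left out, and the names are only meant to echo the kernel's.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum stop_state { STOP_NONE, STOP_PREPARE, STOP_RUN, STOP_EXIT };

struct stop_data {
        unsigned int num_threads;
        atomic_int state;
        atomic_int thread_ack;
};

static void set_state(struct stop_data *d, int newstate)
{
        /* Re-arm the ack counter before publishing the new state. */
        atomic_store(&d->thread_ack, (int)d->num_threads);
        atomic_store(&d->state, newstate);
}

/* The last thread to acknowledge a state moves everybody to the next one. */
static void ack_state(struct stop_data *d)
{
        if (atomic_fetch_sub(&d->thread_ack, 1) == 1)
                set_state(d, atomic_load(&d->state) + 1);
}

static void *stopper(void *arg)
{
        struct stop_data *d = arg;
        int curstate = STOP_NONE;

        do {
                int s = atomic_load(&d->state);

                if (s != curstate) {
                        curstate = s;
                        if (curstate == STOP_RUN)
                                puts("RUN step executed");
                        ack_state(d);
                }
        } while (curstate != STOP_EXIT);

        return NULL;
}

int main(void)
{
        struct stop_data d = { .num_threads = 2 };
        pthread_t t[2];

        set_state(&d, STOP_PREPARE);
        for (int i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, stopper, &d);
        for (int i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}
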
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b299f8..c18ecca575b4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
337 if (rgid != (gid_t) -1) { 337 if (rgid != (gid_t) -1) {
338 if (gid_eq(old->gid, krgid) || 338 if (gid_eq(old->gid, krgid) ||
339 gid_eq(old->egid, krgid) || 339 gid_eq(old->egid, krgid) ||
340 nsown_capable(CAP_SETGID)) 340 ns_capable(old->user_ns, CAP_SETGID))
341 new->gid = krgid; 341 new->gid = krgid;
342 else 342 else
343 goto error; 343 goto error;
@@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
346 if (gid_eq(old->gid, kegid) || 346 if (gid_eq(old->gid, kegid) ||
347 gid_eq(old->egid, kegid) || 347 gid_eq(old->egid, kegid) ||
348 gid_eq(old->sgid, kegid) || 348 gid_eq(old->sgid, kegid) ||
349 nsown_capable(CAP_SETGID)) 349 ns_capable(old->user_ns, CAP_SETGID))
350 new->egid = kegid; 350 new->egid = kegid;
351 else 351 else
352 goto error; 352 goto error;
@@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
387 old = current_cred(); 387 old = current_cred();
388 388
389 retval = -EPERM; 389 retval = -EPERM;
390 if (nsown_capable(CAP_SETGID)) 390 if (ns_capable(old->user_ns, CAP_SETGID))
391 new->gid = new->egid = new->sgid = new->fsgid = kgid; 391 new->gid = new->egid = new->sgid = new->fsgid = kgid;
392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
393 new->egid = new->fsgid = kgid; 393 new->egid = new->fsgid = kgid;
@@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
471 new->uid = kruid; 471 new->uid = kruid;
472 if (!uid_eq(old->uid, kruid) && 472 if (!uid_eq(old->uid, kruid) &&
473 !uid_eq(old->euid, kruid) && 473 !uid_eq(old->euid, kruid) &&
474 !nsown_capable(CAP_SETUID)) 474 !ns_capable(old->user_ns, CAP_SETUID))
475 goto error; 475 goto error;
476 } 476 }
477 477
@@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
480 if (!uid_eq(old->uid, keuid) && 480 if (!uid_eq(old->uid, keuid) &&
481 !uid_eq(old->euid, keuid) && 481 !uid_eq(old->euid, keuid) &&
482 !uid_eq(old->suid, keuid) && 482 !uid_eq(old->suid, keuid) &&
483 !nsown_capable(CAP_SETUID)) 483 !ns_capable(old->user_ns, CAP_SETUID))
484 goto error; 484 goto error;
485 } 485 }
486 486
@@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
534 old = current_cred(); 534 old = current_cred();
535 535
536 retval = -EPERM; 536 retval = -EPERM;
537 if (nsown_capable(CAP_SETUID)) { 537 if (ns_capable(old->user_ns, CAP_SETUID)) {
538 new->suid = new->uid = kuid; 538 new->suid = new->uid = kuid;
539 if (!uid_eq(kuid, old->uid)) { 539 if (!uid_eq(kuid, old->uid)) {
540 retval = set_user(new); 540 retval = set_user(new);
@@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
591 old = current_cred(); 591 old = current_cred();
592 592
593 retval = -EPERM; 593 retval = -EPERM;
594 if (!nsown_capable(CAP_SETUID)) { 594 if (!ns_capable(old->user_ns, CAP_SETUID)) {
595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
597 goto error; 597 goto error;
@@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
673 old = current_cred(); 673 old = current_cred();
674 674
675 retval = -EPERM; 675 retval = -EPERM;
676 if (!nsown_capable(CAP_SETGID)) { 676 if (!ns_capable(old->user_ns, CAP_SETGID)) {
677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
679 goto error; 679 goto error;
@@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
744 744
745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
747 nsown_capable(CAP_SETUID)) { 747 ns_capable(old->user_ns, CAP_SETUID)) {
748 if (!uid_eq(kuid, old->fsuid)) { 748 if (!uid_eq(kuid, old->fsuid)) {
749 new->fsuid = kuid; 749 new->fsuid = kuid;
750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
783 783
784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
786 nsown_capable(CAP_SETGID)) { 786 ns_capable(old->user_ns, CAP_SETGID)) {
787 if (!gid_eq(kgid, old->fsgid)) { 787 if (!gid_eq(kgid, old->fsgid)) {
788 new->fsgid = kgid; 788 new->fsgid = kgid;
789 goto change_okay; 789 goto change_okay;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc468e17..36547dddcdb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
 192/* Note: sysrq code uses its own private copy */ 192/* Note: sysrq code uses its own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -1049,6 +1056,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1056 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1057 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1058 .proc_handler = perf_proc_update_handler,
1059 .extra1 = &one,
1052 }, 1060 },
1053 { 1061 {
1054 .procname = "perf_cpu_time_max_percent", 1062 .procname = "perf_cpu_time_max_percent",
@@ -1225,7 +1233,7 @@ static struct ctl_table vm_table[] = {
1225 .data = &hugepages_treat_as_movable, 1233 .data = &hugepages_treat_as_movable,
1226 .maxlen = sizeof(int), 1234 .maxlen = sizeof(int),
1227 .mode = 0644, 1235 .mode = 0644,
1228 .proc_handler = hugetlb_treat_movable_handler, 1236 .proc_handler = proc_dointvec,
1229 }, 1237 },
1230 { 1238 {
1231 .procname = "nr_overcommit_hugepages", 1239 .procname = "nr_overcommit_hugepages",
@@ -1471,14 +1479,14 @@ static struct ctl_table fs_table[] = {
1471 { 1479 {
1472 .procname = "inode-nr", 1480 .procname = "inode-nr",
1473 .data = &inodes_stat, 1481 .data = &inodes_stat,
1474 .maxlen = 2*sizeof(int), 1482 .maxlen = 2*sizeof(long),
1475 .mode = 0444, 1483 .mode = 0444,
1476 .proc_handler = proc_nr_inodes, 1484 .proc_handler = proc_nr_inodes,
1477 }, 1485 },
1478 { 1486 {
1479 .procname = "inode-state", 1487 .procname = "inode-state",
1480 .data = &inodes_stat, 1488 .data = &inodes_stat,
1481 .maxlen = 7*sizeof(int), 1489 .maxlen = 7*sizeof(long),
1482 .mode = 0444, 1490 .mode = 0444,
1483 .proc_handler = proc_nr_inodes, 1491 .proc_handler = proc_nr_inodes,
1484 }, 1492 },
@@ -1508,7 +1516,7 @@ static struct ctl_table fs_table[] = {
1508 { 1516 {
1509 .procname = "dentry-state", 1517 .procname = "dentry-state",
1510 .data = &dentry_stat, 1518 .data = &dentry_stat,
1511 .maxlen = 6*sizeof(int), 1519 .maxlen = 6*sizeof(long),
1512 .mode = 0444, 1520 .mode = 0444,
1513 .proc_handler = proc_nr_dentry, 1521 .proc_handler = proc_nr_dentry,
1514 }, 1522 },
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c92d6f3..8727032e3a6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -4,6 +4,23 @@
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */ 5static struct callback_head work_exited; /* all we need is ->next == NULL */
6 6
7/**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
22 * 0 if succeeds or -ESRCH.
23 */
7int 24int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 25task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{ 26{
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
21 return 0; 38 return 0;
22} 39}
23 40
41/**
42 * task_work_cancel - cancel a pending work added by task_work_add()
43 * @task: the task which should execute the work
44 * @func: identifies the work to remove
45 *
46 * Find the last queued pending work with ->func == @func and remove
47 * it from queue.
48 *
49 * RETURNS:
50 * The found work or NULL if not found.
51 */
24struct callback_head * 52struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func) 53task_work_cancel(struct task_struct *task, task_work_func_t func)
26{ 54{
27 struct callback_head **pprev = &task->task_works; 55 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL; 56 struct callback_head *work;
29 unsigned long flags; 57 unsigned long flags;
30 /* 58 /*
31 * If cmpxchg() fails we continue without updating pprev. 59 * If cmpxchg() fails we continue without updating pprev.
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
35 */ 63 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 64 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) { 65 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends(); 66 smp_read_barrier_depends();
39 if (work->func != func) 67 if (work->func != func)
40 pprev = &work->next; 68 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work) 69 else if (cmpxchg(pprev, work, work->next) == work)
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
46 return work; 74 return work;
47} 75}
48 76
77/**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
49void task_work_run(void) 85void task_work_run(void)
50{ 86{
51 struct task_struct *task = current; 87 struct task_struct *task = current;
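
The new kernel-doc above spells out the contract: task_work_add() queues a callback_head onto the task, task_work_run() flushes the queue when the task heads back to user mode or exits, and task_work_cancel() can pluck a pending entry back out. The queue itself is a cmpxchg-managed singly linked list; the userspace sketch below reproduces only the add/run shape of it with C11 atomics (no exit sentinel, no pi_lock coordination, and callbacks simply run newest-first here).

#include <stdatomic.h>
#include <stdio.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

static _Atomic(struct callback_head *) task_works;

/* Lock-free push: the shape of the cmpxchg loop task_work_add() uses. */
static void work_add(struct callback_head *work)
{
        struct callback_head *head = atomic_load(&task_works);

        do {
                work->next = head;
        } while (!atomic_compare_exchange_weak(&task_works, &head, work));
}

/* Detach the whole list and run the callbacks (newest first in this sketch). */
static void work_run(void)
{
        struct callback_head *work = atomic_exchange(&task_works, NULL);

        while (work) {
                struct callback_head *next = work->next;

                work->func(work);
                work = next;
        }
}

static void say_hello(struct callback_head *cb)
{
        (void)cb;
        puts("task work ran");
}

int main(void)
{
        struct callback_head cb = { .func = say_hello };

        work_add(&cb);
        work_run();
        return 0;
}
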
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,12 +100,11 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN 107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK 108 select IRQ_WORK
110 help 109 help
111 Adaptively try to shutdown the tick whenever possible, even when 110 Adaptively try to shutdown the tick whenever possible, even when
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL
134 Note the boot CPU will still be kept outside the range to 133 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty. 134 handle the timekeeping duty.
136 135
136config NO_HZ_FULL_SYSIDLE
137 bool "Detect full-system idle state for full dynticks system"
138 depends on NO_HZ_FULL
139 default n
140 help
141 At least one CPU must keep the scheduling-clock tick running for
142 timekeeping purposes whenever there is a non-idle CPU, where
143 "non-idle" also includes dynticks CPUs as long as they are
144 running non-idle tasks. Because the underlying adaptive-tick
145 support cannot distinguish between all CPUs being idle and
146 all CPUs each running a single task in dynticks mode, the
147 underlying support simply ensures that there is always a CPU
148 handling the scheduling-clock tick, whether or not all CPUs
149 are idle. This Kconfig option enables scalable detection of
150 the all-CPUs-idle state, thus allowing the scheduling-clock
151 tick to be disabled when all CPUs are idle. Note that scalable
152 detection of the all-CPUs-idle state means that larger systems
153 will be slower to declare the all-CPUs-idle state.
154
155 Say Y if you would like to help debug all-CPUs-idle detection.
156
157 Say N if you are unsure.
158
159config NO_HZ_FULL_SYSIDLE_SMALL
160 int "Number of CPUs above which large-system approach is used"
161 depends on NO_HZ_FULL_SYSIDLE
162 range 1 NR_CPUS
163 default 8
164 help
165 The full-system idle detection mechanism takes a lazy approach
166 on large systems, as is required to attain decent scalability.
167 However, on smaller systems, scalability is not anywhere near as
168 large a concern as is energy efficiency. The sysidle subsystem
169 therefore uses a fast but non-scalable algorithm for small
170 systems and a lazier but scalable algorithm for large systems.
171 This Kconfig parameter defines the number of CPUs in the largest
172 system that will be considered to be "small".
173
174 The default value will be fine in most cases. Battery-powered
175 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
176 numbers of CPUs, and (3) are suffering from battery-lifetime
177 problems due to long sysidle latencies might wish to experiment
178 with larger values for this Kconfig parameter. On the other
179 hand, they might be even better served by disabling NO_HZ_FULL
180 entirely, given that NO_HZ_FULL is intended for HPC and
181 real-time workloads that at present do not tend to be run on
182 battery-powered systems.
183
184 Take the default if you are unsure.
185
137config NO_HZ 186config NO_HZ
138 bool "Old Idle dynticks config" 187 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 188 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c866789..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
33 int res; 33 int res;
34}; 34};
35 35
36/** 36static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 bool ismax)
38 * @latch: value to convert
39 * @evt: pointer to clock event device descriptor
40 *
41 * Math helper, returns latch value converted to nanoseconds (bound checked)
42 */
43u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
44{ 38{
45 u64 clc = (u64) latch << evt->shift; 39 u64 clc = (u64) latch << evt->shift;
40 u64 rnd;
46 41
47 if (unlikely(!evt->mult)) { 42 if (unlikely(!evt->mult)) {
48 evt->mult = 1; 43 evt->mult = 1;
49 WARN_ON(1); 44 WARN_ON(1);
50 } 45 }
46 rnd = (u64) evt->mult - 1;
47
48 /*
49 * Upper bound sanity check. If the backwards conversion is
50 * not equal latch, we know that the above shift overflowed.
51 */
52 if ((clc >> evt->shift) != (u64)latch)
53 clc = ~0ULL;
54
55 /*
56 * Scaled math oddities:
57 *
58 * For mult <= (1 << shift) we can safely add mult - 1 to
59 * prevent integer rounding loss. So the backwards conversion
60 * from nsec to device ticks will be correct.
61 *
62 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
63 * need to be careful. Adding mult - 1 will result in a value
64 * which when converted back to device ticks can be larger
65 * than latch by up to (mult - 1) >> shift. For the min_delta
66 * calculation we still want to apply this in order to stay
67 * above the minimum device ticks limit. For the upper limit
68 * we would end up with a latch value larger than the upper
69 * limit of the device, so we omit the add to stay below the
70 * device upper boundary.
71 *
72 * Also omit the add if it would overflow the u64 boundary.
73 */
74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift)))
76 clc += rnd;
51 77
52 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
53 if (clc < 1000)
54 clc = 1000;
55 if (clc > KTIME_MAX)
56 clc = KTIME_MAX;
57 79
58 return clc; 80 /* Deltas less than 1usec are pointless noise */
81 return clc > 1000 ? clc : 1000;
82}
83
84/**
85 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
86 * @latch: value to convert
87 * @evt: pointer to clock event device descriptor
88 *
89 * Math helper, returns latch value converted to nanoseconds (bound checked)
90 */
91u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
92{
93 return cev_delta2ns(latch, evt, false);
59} 94}
60EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
61 96
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
380 sec = 600; 415 sec = 600;
381 416
382 clockevents_calc_mult_shift(dev, freq, sec); 417 clockevents_calc_mult_shift(dev, freq, sec);
383 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); 418 dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
384 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); 419 dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
385} 420}
386 421
387/** 422/**
@@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
584 const char *buf, size_t count) 619 const char *buf, size_t count)
585{ 620{
586 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce; 623 struct clock_event_device *ce;
589 624
590 if (ret < 0) 625 if (ret < 0)
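
The new cev_delta2ns() helper above folds the old clockevent_delta2ns() math together with a rounding rule: mult - 1 is added before the divide so that converting the result back to device ticks never drops below the device minimum, but the add is skipped for the max bound on >1GHz devices (mult > 1 << shift) and whenever it would overflow. A minimal userspace sketch of that arithmetic, with made-up mult/shift values (roughly a 73MHz event device):

/* Minimal userspace sketch of the cev_delta2ns() rounding rules above;
 * the mult/shift values are made up (roughly a 73MHz event device). */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift, bool ismax)
{
    uint64_t clc = (uint64_t)latch << shift;
    uint64_t rnd = (uint64_t)mult - 1;

    /* If the shift lost bits, the latch does not convert back; saturate. */
    if ((clc >> shift) != (uint64_t)latch)
        clc = ~0ULL;

    /* Round up, except for the max bound on >1GHz devices, or on overflow. */
    if ((~0ULL - clc > rnd) && (!ismax || mult <= (1U << shift)))
        clc += rnd;

    clc /= mult;

    return clc > 1000 ? clc : 1000; /* deltas under 1usec are noise */
}

int main(void)
{
    uint32_t mult = 75, shift = 10; /* ~13.65ns per device tick */

    printf("min_delta_ns for 2 ticks:          %llu\n",
           (unsigned long long)delta2ns(2, mult, shift, false));
    printf("max_delta_ns for 0xffffffff ticks: %llu\n",
           (unsigned long long)delta2ns(0xffffffff, mult, shift, true));
    return 0;
}

With ismax=false the 1000ns clamp shows up for tiny latch values; with ismax=true the rounding add is what gets dropped whenever mult exceeds 1 << shift.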
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
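
clocks_calc_max_nsecs() above is the old clocksource_max_deferment() body split out so it works on raw mult/shift/maxadj/mask values; sched_clock reuses it in the sched_clock.c hunk further down. A standalone sketch of the same bound, with illustrative parameters for a 1MHz counter behind a 32-bit mask:

/* Userspace sketch of the clocks_calc_max_nsecs() bound; parameters are
 * illustrative (roughly a 1MHz counter with a 32-bit mask). */
#include <stdio.h>
#include <stdint.h>

static int ilog2_u32(uint32_t v)
{
    int l = -1;

    while (v) {
        v >>= 1;
        l++;
    }
    return l;
}

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj, uint64_t mask)
{
    /* Largest cycle count whose cyc2ns() result stays below 2^63. */
    uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

    if (max_cycles > mask)
        max_cycles = mask;

    /* Convert with the worst-case (smallest) multiplier. */
    return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
    uint32_t mult = 1000u << 20, shift = 20;  /* 1 cycle ~ 1000ns */
    uint32_t maxadj = mult / 9;               /* ~11% adjustment headroom */
    uint64_t mask = 0xffffffffULL;

    printf("max deferment: %llu ns\n",
           (unsigned long long)calc_max_nsecs(mult, shift, maxadj, mask));
    return 0;
}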
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577b..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
478 * We want the clock to be within a couple of ticks from the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
@@ -516,13 +517,13 @@ static void sync_cmos_clock(struct work_struct *work)
516 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); 517 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
517} 518}
518 519
519static void notify_cmos_timer(void) 520void ntp_notify_cmos_timer(void)
520{ 521{
521 schedule_delayed_work(&sync_cmos_work, 0); 522 schedule_delayed_work(&sync_cmos_work, 0);
522} 523}
523 524
524#else 525#else
525static inline void notify_cmos_timer(void) { } 526void ntp_notify_cmos_timer(void) { }
526#endif 527#endif
527 528
528 529
@@ -687,8 +688,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
687 if (!(time_status & STA_NANO)) 688 if (!(time_status & STA_NANO))
688 txc->time.tv_usec /= NSEC_PER_USEC; 689 txc->time.tv_usec /= NSEC_PER_USEC;
689 690
690 notify_cmos_timer();
691
692 return result; 691 return result;
693} 692}
694 693
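
Two things change in ntp.c: the CMOS update window grows from half a tick to five ticks around the half-second mark, and the work is now kicked from the timekeeping side via ntp_notify_cmos_timer() instead of unconditionally at the end of __do_adjtimex(). A small sketch of the widened window test, assuming HZ=100 so one tick is 10ms:

/* Sketch of the widened sync_cmos_clock() window test; HZ=100 is assumed,
 * so one tick is 10ms and the window is now +/-50ms around the half second. */
#include <stdio.h>
#include <stdlib.h>

#define NSEC_PER_SEC    1000000000L
#define HZ              100
#define TICK_NSEC       (NSEC_PER_SEC / HZ)

static int in_cmos_update_window(long tv_nsec)
{
    /* Old rule: within half a tick of the half second; new rule: five ticks. */
    return labs(tv_nsec - (NSEC_PER_SEC / 2)) <= TICK_NSEC * 5;
}

int main(void)
{
    printf("490ms into the second: %d\n", in_cmos_update_window(490000000L)); /* 1 */
    printf("560ms into the second: %d\n", in_cmos_update_window(560000000L)); /* 0 */
    return 0;
}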
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f09..68b799375981 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
180 * make it the final one one. 177 * make it the final one one.
181 */ 178 */
182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
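
The sched_clock.c rework drops the epoch_cyc_copy double-write trick in favour of a seqcount: the writer bumps cd.seq around the epoch_ns/epoch_cyc stores and readers retry if they observe the count change. A hand-rolled userspace analogue of that read/retry pattern, using seq_cst C11 atomics throughout for simplicity (the kernel's write_seqcount_begin/end pairs use lighter barriers):

/* Hand-rolled userspace analogue of the seqcount pattern above; everything
 * uses seq_cst C11 atomics for simplicity, unlike the kernel's seqcount. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

struct clock_data {
    _Atomic unsigned int seq;   /* even: stable, odd: update in progress */
    _Atomic uint64_t epoch_ns;
    _Atomic uint64_t epoch_cyc;
};

static void update_epoch(struct clock_data *cd, uint64_t cyc, uint64_t ns)
{
    atomic_fetch_add(&cd->seq, 1);      /* begin: seq becomes odd */
    atomic_store(&cd->epoch_cyc, cyc);
    atomic_store(&cd->epoch_ns, ns);
    atomic_fetch_add(&cd->seq, 1);      /* end: seq becomes even again */
}

static void read_epoch(struct clock_data *cd, uint64_t *cyc, uint64_t *ns)
{
    unsigned int seq;

    do {
        do {
            seq = atomic_load(&cd->seq);
        } while (seq & 1);              /* writer in progress, retry */
        *cyc = atomic_load(&cd->epoch_cyc);
        *ns = atomic_load(&cd->epoch_ns);
    } while (atomic_load(&cd->seq) != seq); /* raced with an update, retry */
}

int main(void)
{
    struct clock_data cd = { 0 };
    uint64_t cyc, ns;

    update_epoch(&cd, 12345, 67890);
    read_epoch(&cd, &cyc, &ns);
    printf("epoch_cyc=%llu epoch_ns=%llu\n",
           (unsigned long long)cyc, (unsigned long long)ns);
    return 0;
}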
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc97e036..3612fc77f834 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h> 24#include <linux/posix-timers.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/context_tracking.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28 29
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148} 149}
149 150
150#ifdef CONFIG_NO_HZ_FULL 151#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask; 152cpumask_var_t tick_nohz_full_mask;
152bool have_nohz_full_mask; 153bool tick_nohz_full_running;
153 154
154static bool can_stop_full_tick(void) 155static bool can_stop_full_tick(void)
155{ 156{
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
182 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
183 * full NO_HZ with this machine. 184 * full NO_HZ with this machine.
184 */ 185 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); 186 WARN_ONCE(tick_nohz_full_running,
187 "NO_HZ FULL will not work with unstable sched clock");
186 return false; 188 return false;
187 } 189 }
188#endif 190#endif
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
196 * Re-evaluate the need for the tick on the current CPU 198 * Re-evaluate the need for the tick on the current CPU
197 * and restart it if necessary. 199 * and restart it if necessary.
198 */ 200 */
199void tick_nohz_full_check(void) 201void __tick_nohz_full_check(void)
200{ 202{
201 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 203 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
202 204
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
210 212
211static void nohz_full_kick_work_func(struct irq_work *work) 213static void nohz_full_kick_work_func(struct irq_work *work)
212{ 214{
213 tick_nohz_full_check(); 215 __tick_nohz_full_check();
214} 216}
215 217
216static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 218static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
229 231
230static void nohz_full_kick_ipi(void *info) 232static void nohz_full_kick_ipi(void *info)
231{ 233{
232 tick_nohz_full_check(); 234 __tick_nohz_full_check();
233} 235}
234 236
235/* 237/*
@@ -238,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
238 */ 240 */
239void tick_nohz_full_kick_all(void) 241void tick_nohz_full_kick_all(void)
240{ 242{
241 if (!have_nohz_full_mask) 243 if (!tick_nohz_full_running)
242 return; 244 return;
243 245
244 preempt_disable(); 246 preempt_disable();
245 smp_call_function_many(nohz_full_mask, 247 smp_call_function_many(tick_nohz_full_mask,
246 nohz_full_kick_ipi, NULL, false); 248 nohz_full_kick_ipi, NULL, false);
249 tick_nohz_full_kick();
247 preempt_enable(); 250 preempt_enable();
248} 251}
249 252
@@ -252,7 +255,7 @@ void tick_nohz_full_kick_all(void)
252 * It might need the tick due to per task/process properties: 255 * It might need the tick due to per task/process properties:
253 * perf events, posix cpu timers, ... 256 * perf events, posix cpu timers, ...
254 */ 257 */
255void tick_nohz_task_switch(struct task_struct *tsk) 258void __tick_nohz_task_switch(struct task_struct *tsk)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
258 261
@@ -268,31 +271,23 @@ out:
268 local_irq_restore(flags); 271 local_irq_restore(flags);
269} 272}
270 273
271int tick_nohz_full_cpu(int cpu)
272{
273 if (!have_nohz_full_mask)
274 return 0;
275
276 return cpumask_test_cpu(cpu, nohz_full_mask);
277}
278
279/* Parse the boot-time nohz CPU list from the kernel parameters. */ 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
280static int __init tick_nohz_full_setup(char *str) 275static int __init tick_nohz_full_setup(char *str)
281{ 276{
282 int cpu; 277 int cpu;
283 278
284 alloc_bootmem_cpumask_var(&nohz_full_mask); 279 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
285 if (cpulist_parse(str, nohz_full_mask) < 0) { 280 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
286 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
287 return 1; 282 return 1;
288 } 283 }
289 284
290 cpu = smp_processor_id(); 285 cpu = smp_processor_id();
291 if (cpumask_test_cpu(cpu, nohz_full_mask)) { 286 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
292 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
293 cpumask_clear_cpu(cpu, nohz_full_mask); 288 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
294 } 289 }
295 have_nohz_full_mask = true; 290 tick_nohz_full_running = true;
296 291
297 return 1; 292 return 1;
298} 293}
@@ -310,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
310 * If we handle the timekeeping duty for full dynticks CPUs, 305 * If we handle the timekeeping duty for full dynticks CPUs,
311 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
312 */ 307 */
313 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
314 return NOTIFY_BAD; 309 return NOTIFY_BAD;
315 break; 310 break;
316 } 311 }
@@ -329,14 +324,14 @@ static int tick_nohz_init_all(void)
329 int err = -1; 324 int err = -1;
330 325
331#ifdef CONFIG_NO_HZ_FULL_ALL 326#ifdef CONFIG_NO_HZ_FULL_ALL
332 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { 327 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
333 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
334 return err; 329 return err;
335 } 330 }
336 err = 0; 331 err = 0;
337 cpumask_setall(nohz_full_mask); 332 cpumask_setall(tick_nohz_full_mask);
338 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); 333 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
339 have_nohz_full_mask = true; 334 tick_nohz_full_running = true;
340#endif 335#endif
341 return err; 336 return err;
342} 337}
@@ -345,17 +340,18 @@ void __init tick_nohz_init(void)
345{ 340{
346 int cpu; 341 int cpu;
347 342
348 if (!have_nohz_full_mask) { 343 if (!tick_nohz_full_running) {
349 if (tick_nohz_init_all() < 0) 344 if (tick_nohz_init_all() < 0)
350 return; 345 return;
351 } 346 }
352 347
348 for_each_cpu(cpu, tick_nohz_full_mask)
349 context_tracking_cpu_set(cpu);
350
353 cpu_notifier(tick_nohz_cpu_down_callback, 0); 351 cpu_notifier(tick_nohz_cpu_down_callback, 0);
354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 352 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 353 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
356} 354}
357#else
358#define have_nohz_full_mask (0)
359#endif 355#endif
360 356
361/* 357/*
@@ -733,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
733 return false; 729 return false;
734 } 730 }
735 731
736 if (have_nohz_full_mask) { 732 if (tick_nohz_full_enabled()) {
737 /* 733 /*
738 * Keep the tick alive to guarantee timekeeping progression 734 * Keep the tick alive to guarantee timekeeping progression
739 * if there are full dynticks CPUs around 735 * if there are full dynticks CPUs around
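
tick_nohz_full_cpu() disappears from tick-sched.c while the mask and flag become the exported tick_nohz_full_mask and tick_nohz_full_running, so the per-CPU test can presumably live in a header as an inline next to tick_nohz_full_enabled(). A hedged sketch of what such header-side helpers could look like; the names follow this patch, but the exact header implementation is an assumption, not shown by the diff:

/* Hedged sketch of header-side inline helpers over the exported symbols;
 * the placement and exact bodies are assumptions, not shown by this diff. */
#ifdef CONFIG_NO_HZ_FULL
extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool tick_nohz_full_enabled(void)
{
    return tick_nohz_full_running;
}

static inline bool tick_nohz_full_cpu(int cpu)
{
    if (!tick_nohz_full_enabled())
        return false;
    return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}
#else
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
#endif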
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc2..3abf53418b67 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
@@ -1703,6 +1704,8 @@ int do_adjtimex(struct timex *txc)
1703 write_seqcount_end(&timekeeper_seq); 1704 write_seqcount_end(&timekeeper_seq);
1704 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1705 1706
1707 ntp_notify_cmos_timer();
1708
1706 return ret; 1709 return ret;
1707} 1710}
1708 1711
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf28323012..61ed862cdd37 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
265static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
266{ 266{
267 struct timer_list_iter *iter = v; 267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269 268
270 if (iter->cpu == -1 && !iter->second_pass) 269 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now); 270 timer_list_header(m, iter->now);
272 else if (!iter->second_pass) 271 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now); 272 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS 273#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
298 return; 297 return;
299} 298}
300 299
301static void *timer_list_start(struct seq_file *file, loff_t *offset) 300static void *move_iter(struct timer_list_iter *iter, loff_t offset)
302{ 301{
303 struct timer_list_iter *iter = file->private; 302 for (; offset; offset--) {
304 303 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
305 if (!*offset) { 304 if (iter->cpu >= nr_cpu_ids) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS 305#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) { 306 if (!iter->second_pass) {
311 iter->cpu = -1; 307 iter->cpu = -1;
312 iter->second_pass = true; 308 iter->second_pass = true;
313 } else 309 } else
314 return NULL; 310 return NULL;
315#else 311#else
316 return NULL; 312 return NULL;
317#endif 313#endif
314 }
318 } 315 }
319 return iter; 316 return iter;
320} 317}
321 318
319static void *timer_list_start(struct seq_file *file, loff_t *offset)
320{
321 struct timer_list_iter *iter = file->private;
322
323 if (!*offset)
324 iter->now = ktime_to_ns(ktime_get());
325 iter->cpu = -1;
326 iter->second_pass = false;
327 return move_iter(iter, *offset);
328}
329
322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) 330static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{ 331{
324 struct timer_list_iter *iter = file->private; 332 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset; 333 ++*offset;
327 return timer_list_start(file, offset); 334 return move_iter(iter, 1);
328} 335}
329 336
330static void timer_list_stop(struct seq_file *seq, void *v) 337static void timer_list_stop(struct seq_file *seq, void *v)
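
The timer_list seq_file iterator is reworked so the timestamp is sampled once (at offset 0) and all position handling funnels through move_iter(), which walks the online CPUs and then, when CONFIG_GENERIC_CLOCKEVENTS is set, restarts for a second pass over the tick devices. A small userspace model of that offset-driven walk, with the online cpumask reduced to a fixed CPU count:

/* Userspace model of the move_iter() offset walk; the online cpumask is
 * reduced to a fixed CPU count and the second pass always exists. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

struct iter {
    int cpu;            /* -1 means "header position" */
    bool second_pass;
};

static struct iter *move_iter(struct iter *it, long offset)
{
    for (; offset; offset--) {
        it->cpu++;                  /* stand-in for cpumask_next() */
        if (it->cpu >= NR_CPUS) {
            if (!it->second_pass) {
                it->cpu = -1;       /* restart for the tick-device pass */
                it->second_pass = true;
            } else {
                return NULL;        /* iteration finished */
            }
        }
    }
    return it;
}

int main(void)
{
    struct iter it = { .cpu = -1, .second_pass = false };
    struct iter *v;

    for (v = move_iter(&it, 0); v; v = move_iter(&it, 1)) {
        if (v->cpu == -1 && !v->second_pass)
            printf("header (timestamp sampled once here)\n");
        else if (!v->second_pass)
            printf("cpu %d timers\n", v->cpu);
        else if (v->cpu == -1)
            printf("tick device header\n");
        else
            printf("cpu %d tick device\n", v->cpu);
    }
    return 0;
}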
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
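
call_timer_fn() previously relied on preempt_count() being assignable, which stops working once the count becomes a per-CPU variable; the local is renamed so it no longer shadows the accessor, and the restore goes through preempt_count_set(). A tiny userspace analogue of the save/verify/restore pattern around a misbehaving callback, with the preempt count simulated by a plain global:

/* Userspace analogue of the save/verify/restore pattern in call_timer_fn();
 * the preempt count is simulated with a plain global. */
#include <stdio.h>

static int fake_preempt_count;

static void leaky_callback(void)
{
    fake_preempt_count++;   /* a callback that "forgets" to re-enable preemption */
}

static void call_timer_fn(void (*fn)(void))
{
    int count = fake_preempt_count; /* snapshot; no longer shadows the accessor */

    fn();

    if (count != fake_preempt_count) {
        fprintf(stderr, "timer: %p preempt leak: %08x -> %08x\n",
                (void *)fn, count, fake_preempt_count);
        fake_preempt_count = count; /* restore, as preempt_count_set() does */
    }
}

int main(void)
{
    call_timer_fn(leaky_callback);
    printf("count after fixup: %d\n", fake_preempt_count);
    return 0;
}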
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a6d098c6df3f..03cf44ac54d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1978 1978
1979void ftrace_modify_all_code(int command) 1979void ftrace_modify_all_code(int command)
1980{ 1980{
1981 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1982
1983 /*
1984 * If the ftrace_caller calls a ftrace_ops func directly,
1985 * we need to make sure that it only traces functions it
1986 * expects to trace. When doing the switch of functions,
1987 * we need to update to the ftrace_ops_list_func first
1988 * before the transition between old and new calls are set,
1989 * as the ftrace_ops_list_func will check the ops hashes
1990 * to make sure the ops are having the right functions
1991 * traced.
1992 */
1993 if (update)
1994 ftrace_update_ftrace_func(ftrace_ops_list_func);
1995
1981 if (command & FTRACE_UPDATE_CALLS) 1996 if (command & FTRACE_UPDATE_CALLS)
1982 ftrace_replace_code(1); 1997 ftrace_replace_code(1);
1983 else if (command & FTRACE_DISABLE_CALLS) 1998 else if (command & FTRACE_DISABLE_CALLS)
1984 ftrace_replace_code(0); 1999 ftrace_replace_code(0);
1985 2000
1986 if (command & FTRACE_UPDATE_TRACE_FUNC) 2001 if (update && ftrace_trace_function != ftrace_ops_list_func)
1987 ftrace_update_ftrace_func(ftrace_trace_function); 2002 ftrace_update_ftrace_func(ftrace_trace_function);
1988 2003
1989 if (command & FTRACE_START_FUNC_RET) 2004 if (command & FTRACE_START_FUNC_RET)
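
The ftrace_modify_all_code() change is an ordering fix: before any call sites are rewritten, the traced function is pointed at ftrace_ops_list_func, which re-checks each ops' hash and therefore tolerates a half-converted state; the final handler is installed only afterwards, and only if it differs from the list function. A small function-pointer sketch of that "switch through a safe intermediary" pattern:

/* Userspace sketch of switching a live function pointer through a safe
 * intermediary before retargeting callers, as the new comment describes. */
#include <stdio.h>

static void old_func(void)  { puts("old handler"); }
static void new_func(void)  { puts("new handler"); }
static void list_func(void) { puts("list handler (re-checks per-ops filters)"); }

static void (*live_call)(void) = old_func;  /* what traced code actually calls */

static void modify_all_code(void (*final)(void))
{
    /* 1: route everything through the filtering list handler first. */
    live_call = list_func;

    /* 2: rewrite individual call sites here (elided); any call made during
     *    the transition lands in list_func, which tolerates the mixed state. */
    live_call();

    /* 3: only afterwards install the final handler, if it differs. */
    if (final != list_func)
        live_call = final;
}

int main(void)
{
    modify_all_code(new_func);
    live_call();
    return 0;
}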
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 496f94d57698..d9fea7dfd5d3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1509#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1512 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1513 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1514}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1515EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1516
@@ -3166,11 +3167,6 @@ static const struct file_operations show_traces_fops = {
3166}; 3167};
3167 3168
3168/* 3169/*
3169 * Only trace on a CPU if the bitmask is set:
3170 */
3171static cpumask_var_t tracing_cpumask;
3172
3173/*
3174 * The tracer itself will not take this lock, but still we want 3170 * The tracer itself will not take this lock, but still we want
3175 * to provide a consistent cpumask to user-space: 3171 * to provide a consistent cpumask to user-space:
3176 */ 3172 */
@@ -3186,11 +3182,12 @@ static ssize_t
3186tracing_cpumask_read(struct file *filp, char __user *ubuf, 3182tracing_cpumask_read(struct file *filp, char __user *ubuf,
3187 size_t count, loff_t *ppos) 3183 size_t count, loff_t *ppos)
3188{ 3184{
3185 struct trace_array *tr = file_inode(filp)->i_private;
3189 int len; 3186 int len;
3190 3187
3191 mutex_lock(&tracing_cpumask_update_lock); 3188 mutex_lock(&tracing_cpumask_update_lock);
3192 3189
3193 len = cpumask_scnprintf(mask_str, count, tracing_cpumask); 3190 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
3194 if (count - len < 2) { 3191 if (count - len < 2) {
3195 count = -EINVAL; 3192 count = -EINVAL;
3196 goto out_err; 3193 goto out_err;
@@ -3208,7 +3205,7 @@ static ssize_t
3208tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3205tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3209 size_t count, loff_t *ppos) 3206 size_t count, loff_t *ppos)
3210{ 3207{
3211 struct trace_array *tr = filp->private_data; 3208 struct trace_array *tr = file_inode(filp)->i_private;
3212 cpumask_var_t tracing_cpumask_new; 3209 cpumask_var_t tracing_cpumask_new;
3213 int err, cpu; 3210 int err, cpu;
3214 3211
@@ -3228,12 +3225,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3228 * Increase/decrease the disabled counter if we are 3225 * Increase/decrease the disabled counter if we are
3229 * about to flip a bit in the cpumask: 3226 * about to flip a bit in the cpumask:
3230 */ 3227 */
3231 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3228 if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3232 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3229 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3233 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3230 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3234 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); 3231 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
3235 } 3232 }
3236 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3233 if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3237 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3234 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3238 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3235 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3239 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3236 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
@@ -3242,7 +3239,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3242 arch_spin_unlock(&ftrace_max_lock); 3239 arch_spin_unlock(&ftrace_max_lock);
3243 local_irq_enable(); 3240 local_irq_enable();
3244 3241
3245 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 3242 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
3246 3243
3247 mutex_unlock(&tracing_cpumask_update_lock); 3244 mutex_unlock(&tracing_cpumask_update_lock);
3248 free_cpumask_var(tracing_cpumask_new); 3245 free_cpumask_var(tracing_cpumask_new);
@@ -3256,9 +3253,10 @@ err_unlock:
3256} 3253}
3257 3254
3258static const struct file_operations tracing_cpumask_fops = { 3255static const struct file_operations tracing_cpumask_fops = {
3259 .open = tracing_open_generic, 3256 .open = tracing_open_generic_tr,
3260 .read = tracing_cpumask_read, 3257 .read = tracing_cpumask_read,
3261 .write = tracing_cpumask_write, 3258 .write = tracing_cpumask_write,
3259 .release = tracing_release_generic_tr,
3262 .llseek = generic_file_llseek, 3260 .llseek = generic_file_llseek,
3263}; 3261};
3264 3262
@@ -5938,6 +5936,11 @@ static int new_instance_create(const char *name)
5938 if (!tr->name) 5936 if (!tr->name)
5939 goto out_free_tr; 5937 goto out_free_tr;
5940 5938
5939 if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
5940 goto out_free_tr;
5941
5942 cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
5943
5941 raw_spin_lock_init(&tr->start_lock); 5944 raw_spin_lock_init(&tr->start_lock);
5942 5945
5943 tr->current_trace = &nop_trace; 5946 tr->current_trace = &nop_trace;
@@ -5969,6 +5972,7 @@ static int new_instance_create(const char *name)
5969 out_free_tr: 5972 out_free_tr:
5970 if (tr->trace_buffer.buffer) 5973 if (tr->trace_buffer.buffer)
5971 ring_buffer_free(tr->trace_buffer.buffer); 5974 ring_buffer_free(tr->trace_buffer.buffer);
5975 free_cpumask_var(tr->tracing_cpumask);
5972 kfree(tr->name); 5976 kfree(tr->name);
5973 kfree(tr); 5977 kfree(tr);
5974 5978
@@ -6098,6 +6102,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6098{ 6102{
6099 int cpu; 6103 int cpu;
6100 6104
6105 trace_create_file("tracing_cpumask", 0644, d_tracer,
6106 tr, &tracing_cpumask_fops);
6107
6101 trace_create_file("trace_options", 0644, d_tracer, 6108 trace_create_file("trace_options", 0644, d_tracer,
6102 tr, &tracing_iter_fops); 6109 tr, &tracing_iter_fops);
6103 6110
@@ -6147,9 +6154,6 @@ static __init int tracer_init_debugfs(void)
6147 6154
6148 init_tracer_debugfs(&global_trace, d_tracer); 6155 init_tracer_debugfs(&global_trace, d_tracer);
6149 6156
6150 trace_create_file("tracing_cpumask", 0644, d_tracer,
6151 &global_trace, &tracing_cpumask_fops);
6152
6153 trace_create_file("available_tracers", 0444, d_tracer, 6157 trace_create_file("available_tracers", 0444, d_tracer,
6154 &global_trace, &show_traces_fops); 6158 &global_trace, &show_traces_fops);
6155 6159
@@ -6371,7 +6375,7 @@ __init static int tracer_alloc_buffers(void)
6371 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6375 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6372 goto out; 6376 goto out;
6373 6377
6374 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 6378 if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
6375 goto out_free_buffer_mask; 6379 goto out_free_buffer_mask;
6376 6380
6377 /* Only allocate trace_printk buffers if a trace_printk exists */ 6381 /* Only allocate trace_printk buffers if a trace_printk exists */
@@ -6386,7 +6390,7 @@ __init static int tracer_alloc_buffers(void)
6386 ring_buf_size = 1; 6390 ring_buf_size = 1;
6387 6391
6388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6392 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
6389 cpumask_copy(tracing_cpumask, cpu_all_mask); 6393 cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
6390 6394
6391 raw_spin_lock_init(&global_trace.start_lock); 6395 raw_spin_lock_init(&global_trace.start_lock);
6392 6396
@@ -6441,7 +6445,7 @@ out_free_cpumask:
6441#ifdef CONFIG_TRACER_MAX_TRACE 6445#ifdef CONFIG_TRACER_MAX_TRACE
6442 free_percpu(global_trace.max_buffer.data); 6446 free_percpu(global_trace.max_buffer.data);
6443#endif 6447#endif
6444 free_cpumask_var(tracing_cpumask); 6448 free_cpumask_var(global_trace.tracing_cpumask);
6445out_free_buffer_mask: 6449out_free_buffer_mask:
6446 free_cpumask_var(tracing_buffer_mask); 6450 free_cpumask_var(tracing_buffer_mask);
6447out: 6451out:
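
tracing_cpumask stops being a single global and becomes a per-trace_array field: new_instance_create() allocates it, the error path frees it, and the tracing_cpumask file moves into each instance's own directory. A compact userspace sketch of that per-instance shape, with the cpumask reduced to a plain unsigned long bitmask:

/* Userspace sketch of the per-instance cpumask shape; the cpumask is reduced
 * to a plain unsigned long bitmask and instances are just heap structs. */
#include <stdio.h>
#include <stdlib.h>

struct trace_array {
    const char *name;
    unsigned long tracing_cpumask; /* bit n set => trace on CPU n */
};

static struct trace_array *new_instance_create(const char *name, unsigned int nr_cpus)
{
    struct trace_array *tr = calloc(1, sizeof(*tr));

    if (!tr)
        return NULL;
    tr->name = name;
    if (nr_cpus >= 8 * sizeof(tr->tracing_cpumask))
        tr->tracing_cpumask = ~0UL;             /* cpu_all_mask equivalent */
    else
        tr->tracing_cpumask = (1UL << nr_cpus) - 1;
    return tr;
}

static int cpu_is_traced(const struct trace_array *tr, unsigned int cpu)
{
    return !!(tr->tracing_cpumask & (1UL << cpu));
}

int main(void)
{
    struct trace_array *global = new_instance_create("global", 4);
    struct trace_array *inst = new_instance_create("instance-1", 4);

    if (!global || !inst)
        return 1;

    inst->tracing_cpumask &= ~(1UL << 0); /* instance-1 stops tracing CPU 0 */

    printf("global traces cpu0: %d, instance-1 traces cpu0: %d\n",
           cpu_is_traced(global, 0), cpu_is_traced(inst, 0));
    free(global);
    free(inst);
    return 0;
}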
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41b0a02..73d08aa25b55 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
@@ -206,6 +207,7 @@ struct trace_array {
206 struct dentry *event_dir; 207 struct dentry *event_dir;
207 struct list_head systems; 208 struct list_head systems;
208 struct list_head events; 209 struct list_head events;
210 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
209 int ref; 211 int ref;
210}; 212};
211 213
@@ -1022,6 +1024,9 @@ extern struct list_head ftrace_events;
1022extern const char *__start___trace_bprintk_fmt[]; 1024extern const char *__start___trace_bprintk_fmt[];
1023extern const char *__stop___trace_bprintk_fmt[]; 1025extern const char *__stop___trace_bprintk_fmt[];
1024 1026
1027extern const char *__start___tracepoint_str[];
1028extern const char *__stop___tracepoint_str[];
1029
1025void trace_printk_init_buffers(void); 1030void trace_printk_init_buffers(void);
1026void trace_printk_start_comm(void); 1031void trace_printk_start_comm(void);
1027int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1032int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
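
TRACE_FLAG_PREEMPT_RESCHED (0x20) is recorded next to TRACE_FLAG_NEED_RESCHED so the trace output can tell the two resched conditions apart. A sketch of how the two bits compose; the predicates here are plain booleans, not the kernel's tif_need_resched()/test_preempt_need_resched() helpers:

/* Sketch of composing the two resched flag bits; the predicates here are
 * plain booleans standing in for the real per-task/per-cpu tests. */
#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED     0x04
#define TRACE_FLAG_PREEMPT_RESCHED  0x20

static unsigned int resched_flags(int need_resched, int preempt_need_resched)
{
    return (need_resched ? TRACE_FLAG_NEED_RESCHED : 0) |
           (preempt_need_resched ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}

int main(void)
{
    printf("flags: 0x%02x\n", resched_flags(1, 1)); /* 0x24: both bits set */
    return 0;
}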
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..78e27e3b52ac 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
26{ 26{
27 /* The ftrace function trace is allowed only for root. */ 27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 30 return -EPERM;
31 31
32 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29a7ebcfb426..368a4d50cc30 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1489} 1489}
1490 1490
1491static int 1491static int
1492event_create_dir(struct dentry *parent, 1492event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1493 struct ftrace_event_file *file,
1494 const struct file_operations *id,
1495 const struct file_operations *enable,
1496 const struct file_operations *filter,
1497 const struct file_operations *format)
1498{ 1493{
1499 struct ftrace_event_call *call = file->event_call; 1494 struct ftrace_event_call *call = file->event_call;
1500 struct trace_array *tr = file->tr; 1495 struct trace_array *tr = file->tr;
@@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent,
1522 1517
1523 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1518 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1524 trace_create_file("enable", 0644, file->dir, file, 1519 trace_create_file("enable", 0644, file->dir, file,
1525 enable); 1520 &ftrace_enable_fops);
1526 1521
1527#ifdef CONFIG_PERF_EVENTS 1522#ifdef CONFIG_PERF_EVENTS
1528 if (call->event.type && call->class->reg) 1523 if (call->event.type && call->class->reg)
1529 trace_create_file("id", 0444, file->dir, 1524 trace_create_file("id", 0444, file->dir,
1530 (void *)(long)call->event.type, id); 1525 (void *)(long)call->event.type,
1526 &ftrace_event_id_fops);
1531#endif 1527#endif
1532 1528
1533 /* 1529 /*
@@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent,
1544 } 1540 }
1545 } 1541 }
1546 trace_create_file("filter", 0644, file->dir, call, 1542 trace_create_file("filter", 0644, file->dir, call,
1547 filter); 1543 &ftrace_event_filter_fops);
1548 1544
1549 trace_create_file("format", 0444, file->dir, call, 1545 trace_create_file("format", 0444, file->dir, call,
1550 format); 1546 &ftrace_event_format_fops);
1551 1547
1552 return 0; 1548 return 0;
1553} 1549}
@@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call,
1648 1644
1649/* Add an event to a trace directory */ 1645/* Add an event to a trace directory */
1650static int 1646static int
1651__trace_add_new_event(struct ftrace_event_call *call, 1647__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
1652 struct trace_array *tr,
1653 const struct file_operations *id,
1654 const struct file_operations *enable,
1655 const struct file_operations *filter,
1656 const struct file_operations *format)
1657{ 1648{
1658 struct ftrace_event_file *file; 1649 struct ftrace_event_file *file;
1659 1650
@@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
1661 if (!file) 1652 if (!file)
1662 return -ENOMEM; 1653 return -ENOMEM;
1663 1654
1664 return event_create_dir(tr->event_dir, file, id, enable, filter, format); 1655 return event_create_dir(tr->event_dir, file);
1665} 1656}
1666 1657
1667/* 1658/*
@@ -1683,8 +1674,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
1683} 1674}
1684 1675
1685struct ftrace_module_file_ops; 1676struct ftrace_module_file_ops;
1686static void __add_event_to_tracers(struct ftrace_event_call *call, 1677static void __add_event_to_tracers(struct ftrace_event_call *call);
1687 struct ftrace_module_file_ops *file_ops);
1688 1678
1689/* Add an additional event_call dynamically */ 1679/* Add an additional event_call dynamically */
1690int trace_add_event_call(struct ftrace_event_call *call) 1680int trace_add_event_call(struct ftrace_event_call *call)
@@ -1695,7 +1685,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
1695 1685
1696 ret = __register_event(call, NULL); 1686 ret = __register_event(call, NULL);
1697 if (ret >= 0) 1687 if (ret >= 0)
1698 __add_event_to_tracers(call, NULL); 1688 __add_event_to_tracers(call);
1699 1689
1700 mutex_unlock(&event_mutex); 1690 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock); 1691 mutex_unlock(&trace_types_lock);
@@ -1769,100 +1759,21 @@ int trace_remove_event_call(struct ftrace_event_call *call)
1769 1759
1770#ifdef CONFIG_MODULES 1760#ifdef CONFIG_MODULES
1771 1761
1772static LIST_HEAD(ftrace_module_file_list);
1773
1774/*
1775 * Modules must own their file_operations to keep up with
1776 * reference counting.
1777 */
1778struct ftrace_module_file_ops {
1779 struct list_head list;
1780 struct module *mod;
1781 struct file_operations id;
1782 struct file_operations enable;
1783 struct file_operations format;
1784 struct file_operations filter;
1785};
1786
1787static struct ftrace_module_file_ops *
1788find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1789{
1790 /*
1791 * As event_calls are added in groups by module,
1792 * when we find one file_ops, we don't need to search for
1793 * each call in that module, as the rest should be the
1794 * same. Only search for a new one if the last one did
1795 * not match.
1796 */
1797 if (file_ops && mod == file_ops->mod)
1798 return file_ops;
1799
1800 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1801 if (file_ops->mod == mod)
1802 return file_ops;
1803 }
1804 return NULL;
1805}
1806
1807static struct ftrace_module_file_ops *
1808trace_create_file_ops(struct module *mod)
1809{
1810 struct ftrace_module_file_ops *file_ops;
1811
1812 /*
1813 * This is a bit of a PITA. To allow for correct reference
1814 * counting, modules must "own" their file_operations.
1815 * To do this, we allocate the file operations that will be
1816 * used in the event directory.
1817 */
1818
1819 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1820 if (!file_ops)
1821 return NULL;
1822
1823 file_ops->mod = mod;
1824
1825 file_ops->id = ftrace_event_id_fops;
1826 file_ops->id.owner = mod;
1827
1828 file_ops->enable = ftrace_enable_fops;
1829 file_ops->enable.owner = mod;
1830
1831 file_ops->filter = ftrace_event_filter_fops;
1832 file_ops->filter.owner = mod;
1833
1834 file_ops->format = ftrace_event_format_fops;
1835 file_ops->format.owner = mod;
1836
1837 list_add(&file_ops->list, &ftrace_module_file_list);
1838
1839 return file_ops;
1840}
1841
1842static void trace_module_add_events(struct module *mod) 1762static void trace_module_add_events(struct module *mod)
1843{ 1763{
1844 struct ftrace_module_file_ops *file_ops = NULL;
1845 struct ftrace_event_call **call, **start, **end; 1764 struct ftrace_event_call **call, **start, **end;
1846 1765
1847 start = mod->trace_events; 1766 start = mod->trace_events;
1848 end = mod->trace_events + mod->num_trace_events; 1767 end = mod->trace_events + mod->num_trace_events;
1849 1768
1850 if (start == end)
1851 return;
1852
1853 file_ops = trace_create_file_ops(mod);
1854 if (!file_ops)
1855 return;
1856
1857 for_each_event(call, start, end) { 1769 for_each_event(call, start, end) {
1858 __register_event(*call, mod); 1770 __register_event(*call, mod);
1859 __add_event_to_tracers(*call, file_ops); 1771 __add_event_to_tracers(*call);
1860 } 1772 }
1861} 1773}
1862 1774
1863static void trace_module_remove_events(struct module *mod) 1775static void trace_module_remove_events(struct module *mod)
1864{ 1776{
1865 struct ftrace_module_file_ops *file_ops;
1866 struct ftrace_event_call *call, *p; 1777 struct ftrace_event_call *call, *p;
1867 bool clear_trace = false; 1778 bool clear_trace = false;
1868 1779
@@ -1874,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod)
1874 __trace_remove_event_call(call); 1785 __trace_remove_event_call(call);
1875 } 1786 }
1876 } 1787 }
1877
1878 /* Now free the file_operations */
1879 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1880 if (file_ops->mod == mod)
1881 break;
1882 }
1883 if (&file_ops->list != &ftrace_module_file_list) {
1884 list_del(&file_ops->list);
1885 kfree(file_ops);
1886 }
1887 up_write(&trace_event_sem); 1788 up_write(&trace_event_sem);
1888 1789
1889 /* 1790 /*
@@ -1919,67 +1820,21 @@ static int trace_module_notify(struct notifier_block *self,
1919 return 0; 1820 return 0;
1920} 1821}
1921 1822
1922static int 1823static struct notifier_block trace_module_nb = {
1923__trace_add_new_mod_event(struct ftrace_event_call *call, 1824 .notifier_call = trace_module_notify,
1924 struct trace_array *tr, 1825 .priority = 0,
1925 struct ftrace_module_file_ops *file_ops) 1826};
1926{
1927 return __trace_add_new_event(call, tr,
1928 &file_ops->id, &file_ops->enable,
1929 &file_ops->filter, &file_ops->format);
1930}
1931
1932#else
1933static inline struct ftrace_module_file_ops *
1934find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1935{
1936 return NULL;
1937}
1938static inline int trace_module_notify(struct notifier_block *self,
1939 unsigned long val, void *data)
1940{
1941 return 0;
1942}
1943static inline int
1944__trace_add_new_mod_event(struct ftrace_event_call *call,
1945 struct trace_array *tr,
1946 struct ftrace_module_file_ops *file_ops)
1947{
1948 return -ENODEV;
1949}
1950#endif /* CONFIG_MODULES */ 1827#endif /* CONFIG_MODULES */
1951 1828
1952/* Create a new event directory structure for a trace directory. */ 1829/* Create a new event directory structure for a trace directory. */
1953static void 1830static void
1954__trace_add_event_dirs(struct trace_array *tr) 1831__trace_add_event_dirs(struct trace_array *tr)
1955{ 1832{
1956 struct ftrace_module_file_ops *file_ops = NULL;
1957 struct ftrace_event_call *call; 1833 struct ftrace_event_call *call;
1958 int ret; 1834 int ret;
1959 1835
1960 list_for_each_entry(call, &ftrace_events, list) { 1836 list_for_each_entry(call, &ftrace_events, list) {
1961 if (call->mod) { 1837 ret = __trace_add_new_event(call, tr);
1962 /*
1963 * Directories for events by modules need to
1964 * keep module ref counts when opened (as we don't
1965 * want the module to disappear when reading one
1966 * of these files). The file_ops keep account of
1967 * the module ref count.
1968 */
1969 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1970 if (!file_ops)
1971 continue; /* Warn? */
1972 ret = __trace_add_new_mod_event(call, tr, file_ops);
1973 if (ret < 0)
1974 pr_warning("Could not create directory for event %s\n",
1975 call->name);
1976 continue;
1977 }
1978 ret = __trace_add_new_event(call, tr,
1979 &ftrace_event_id_fops,
1980 &ftrace_enable_fops,
1981 &ftrace_event_filter_fops,
1982 &ftrace_event_format_fops);
1983 if (ret < 0) 1838 if (ret < 0)
1984 pr_warning("Could not create directory for event %s\n", 1839 pr_warning("Could not create directory for event %s\n",
1985 call->name); 1840 call->name);
@@ -2287,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2287 2142
2288 2143
2289 list_for_each_entry(file, &tr->events, list) { 2144 list_for_each_entry(file, &tr->events, list) {
2290 ret = event_create_dir(tr->event_dir, file, 2145 ret = event_create_dir(tr->event_dir, file);
2291 &ftrace_event_id_fops,
2292 &ftrace_enable_fops,
2293 &ftrace_event_filter_fops,
2294 &ftrace_event_format_fops);
2295 if (ret < 0) 2146 if (ret < 0)
2296 pr_warning("Could not create directory for event %s\n", 2147 pr_warning("Could not create directory for event %s\n",
2297 file->event_call->name); 2148 file->event_call->name);
@@ -2332,29 +2183,14 @@ __trace_remove_event_dirs(struct trace_array *tr)
2332 remove_event_file_dir(file); 2183 remove_event_file_dir(file);
2333} 2184}
2334 2185
2335static void 2186static void __add_event_to_tracers(struct ftrace_event_call *call)
2336__add_event_to_tracers(struct ftrace_event_call *call,
2337 struct ftrace_module_file_ops *file_ops)
2338{ 2187{
2339 struct trace_array *tr; 2188 struct trace_array *tr;
2340 2189
2341 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 2190 list_for_each_entry(tr, &ftrace_trace_arrays, list)
2342 if (file_ops) 2191 __trace_add_new_event(call, tr);
2343 __trace_add_new_mod_event(call, tr, file_ops);
2344 else
2345 __trace_add_new_event(call, tr,
2346 &ftrace_event_id_fops,
2347 &ftrace_enable_fops,
2348 &ftrace_event_filter_fops,
2349 &ftrace_event_format_fops);
2350 }
2351} 2192}
2352 2193
2353static struct notifier_block trace_module_nb = {
2354 .notifier_call = trace_module_notify,
2355 .priority = 0,
2356};
2357
2358extern struct ftrace_event_call *__start_ftrace_events[]; 2194extern struct ftrace_event_call *__start_ftrace_events[];
2359extern struct ftrace_event_call *__stop_ftrace_events[]; 2195extern struct ftrace_event_call *__stop_ftrace_events[];
2360 2196
@@ -2559,10 +2395,11 @@ static __init int event_trace_init(void)
2559 if (ret) 2395 if (ret)
2560 return ret; 2396 return ret;
2561 2397
2398#ifdef CONFIG_MODULES
2562 ret = register_module_notifier(&trace_module_nb); 2399 ret = register_module_notifier(&trace_module_nb);
2563 if (ret) 2400 if (ret)
2564 pr_warning("Failed to register trace events module notifier\n"); 2401 pr_warning("Failed to register trace events module notifier\n");
2565 2402#endif
2566 return 0; 2403 return 0;
2567} 2404}
2568early_initcall(event_trace_memsetup); 2405early_initcall(event_trace_memsetup);
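
The trace_events.c hunks above drop the per-module ftrace_module_file_ops bookkeeping: module events now go through the same __trace_add_new_event()/event_create_dir() paths as built-in events, and the module notifier registration is wrapped in #ifdef CONFIG_MODULES so non-modular kernels no longer carry it. Below is a minimal sketch of the generic module-notifier pattern that trace_module_nb relies on; the example_* names are hypothetical and only illustrate the API, not the patch itself.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_module_notify(struct notifier_block *self,
                                 unsigned long val, void *data)
{
        struct module *mod = data;

        switch (val) {
        case MODULE_STATE_COMING:
                /* set up per-module state, e.g. register its trace events */
                pr_info("module %s coming\n", mod->name);
                break;
        case MODULE_STATE_GOING:
                /* tear the per-module state back down */
                pr_info("module %s going\n", mod->name);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_module_nb = {
        .notifier_call = example_module_notify,
        .priority = 0,
};

static int __init example_notifier_init(void)
{
#ifdef CONFIG_MODULES
        return register_module_notifier(&example_module_nb);
#else
        return 0;
#endif
}
early_initcall(example_notifier_init);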
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
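
The trace_output.c change widens the latency-format need-resched column from two states to four: TRACE_FLAG_NEED_RESCHED tracks TIF_NEED_RESCHED and, as I read the new flag, TRACE_FLAG_PREEMPT_RESCHED mirrors PREEMPT_NEED_RESCHED. A compact restatement of the mapping follows; need_resched_char() is a hypothetical helper (it assumes the TRACE_FLAG_* definitions from kernel/trace/trace.h) and is not part of the patch.

static char need_resched_char(unsigned long flags)
{
        bool tif     = flags & TRACE_FLAG_NEED_RESCHED;     /* TIF_NEED_RESCHED was set */
        bool preempt = flags & TRACE_FLAG_PREEMPT_RESCHED;  /* PREEMPT_NEED_RESCHED was set */

        if (tif && preempt)
                return 'N';
        if (tif)
                return 'n';
        if (preempt)
                return 'p';
        return '.';
}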
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
244{ 244{
245 const char **fmt = v; 245 const char **fmt = v;
246 int start_index; 246 int start_index;
247 int last_index;
247 248
248 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 249 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
249 250
250 if (*pos < start_index) 251 if (*pos < start_index)
251 return __start___trace_bprintk_fmt + *pos; 252 return __start___trace_bprintk_fmt + *pos;
252 253
254 /*
255 * The __tracepoint_str section is treated the same as the
256 * __trace_printk_fmt section. The difference is that the
257 * __trace_printk_fmt section should only be used by trace_printk()
258 * in a debugging environment, as if anything exists in that section
 259 * the trace_printk() helper buffers are allocated, which would just
260 * waste space in a production environment.
261 *
262 * The __tracepoint_str sections on the other hand are used by
263 * tracepoints which need to map pointers to their strings to
264 * the ASCII text for userspace.
265 */
266 last_index = start_index;
267 start_index = __stop___tracepoint_str - __start___tracepoint_str;
268
269 if (*pos < last_index + start_index)
270 return __start___tracepoint_str + (*pos - last_index);
271
253 return find_next_mod_format(start_index, v, fmt, pos); 272 return find_next_mod_format(start_index, v, fmt, pos);
254} 273}
255 274
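
find_next() now treats __trace_bprintk_fmt and __tracepoint_str as one virtually concatenated array: positions below the first section's length index into it directly, anything beyond is rebased into the second section, and only then does the lookup fall through to the module formats. The same arithmetic in a standalone form (section_a, section_b, and find_next_flat() are made-up names for illustration):

#include <stddef.h>

static const char *section_a[] = { "fmt A0", "fmt A1" };
static const char *section_b[] = { "str B0", "str B1", "str B2" };

static const char **find_next_flat(size_t pos)
{
        size_t a_len = sizeof(section_a) / sizeof(section_a[0]);
        size_t b_len = sizeof(section_b) / sizeof(section_b[0]);

        if (pos < a_len)                /* still inside the first section */
                return &section_a[pos];
        pos -= a_len;                   /* rebase into the second section */
        if (pos < b_len)
                return &section_b[pos];
        return NULL;                    /* past both: fall back to modules */
}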
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8fd03657bc7d..559329d9bd2f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -200,8 +200,8 @@ extern char *__bad_type_size(void);
200 #type, #name, offsetof(typeof(trace), name), \ 200 #type, #name, offsetof(typeof(trace), name), \
201 sizeof(trace.name), is_signed_type(type) 201 sizeof(trace.name), is_signed_type(type)
202 202
203static 203static int __init
204int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 204__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
205{ 205{
206 int i; 206 int i;
207 int pos = 0; 207 int pos = 0;
@@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
228 return pos; 228 return pos;
229} 229}
230 230
231static int set_syscall_print_fmt(struct ftrace_event_call *call) 231static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
232{ 232{
233 char *print_fmt; 233 char *print_fmt;
234 int len; 234 int len;
@@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)
253 return 0; 253 return 0;
254} 254}
255 255
256static void free_syscall_print_fmt(struct ftrace_event_call *call) 256static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
257{ 257{
258 struct syscall_metadata *entry = call->data; 258 struct syscall_metadata *entry = call->data;
259 259
@@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
459 mutex_unlock(&syscall_trace_lock); 459 mutex_unlock(&syscall_trace_lock);
460} 460}
461 461
462static int init_syscall_trace(struct ftrace_event_call *call) 462static int __init init_syscall_trace(struct ftrace_event_call *call)
463{ 463{
464 int id; 464 int id;
465 int num; 465 int num;
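
The trace_syscalls.c hunks only add __init annotations: these print-format helpers run once while the syscall events are registered at boot, so their text can live in .init.text and be discarded afterwards. A tiny sketch of the annotation pattern (example_setup() is hypothetical):

#include <linux/init.h>
#include <linux/printk.h>

/* Placed in .init.text; the memory is freed once boot-time initcalls finish. */
static int __init example_setup(void)
{
        pr_info("one-time boot setup\n");
        return 0;
}
core_initcall(example_setup);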
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f6c83d7ef000..602e5bbbceff 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!nsown_capable(CAP_SETGID)) 179 if (!ns_capable(current_user_ns(), CAP_SETGID))
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
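
The uid16.c change (and the similar one in utsname.c below) replaces the removed nsown_capable() helper with its open-coded form. As far as I recall, the old helper was simply a wrapper along the lines of the sketch below, so the check is unchanged in effect: "is the caller capable in its own user namespace?". nsown_capable_equiv() is an illustrative name, not the removed definition verbatim.

#include <linux/capability.h>
#include <linux/cred.h>

static inline bool nsown_capable_equiv(int cap)
{
        /* capable with respect to the caller's own user namespace */
        return ns_capable(current_user_ns(), cap);
}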
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf7..630d72bf7e41 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,64 @@
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait) 11 int wait)
12{ 12{
13 unsigned long flags;
14
13 WARN_ON(cpu != 0); 15 WARN_ON(cpu != 0);
14 16
15 local_irq_disable(); 17 local_irq_save(flags);
16 (func)(info); 18 func(info);
17 local_irq_enable(); 19 local_irq_restore(flags);
18 20
19 return 0; 21 return 0;
20} 22}
21EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24
25int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{
27 unsigned long flags;
28
29 local_irq_save(flags);
30 func(info);
31 local_irq_restore(flags);
32 return 0;
33}
34EXPORT_SYMBOL(on_each_cpu);
35
36/*
37 * Note we still need to test the mask even for UP
38 * because we actually can get an empty mask from
39 * code that on SMP might call us without the local
40 * CPU in the mask.
41 */
42void on_each_cpu_mask(const struct cpumask *mask,
43 smp_call_func_t func, void *info, bool wait)
44{
45 unsigned long flags;
46
47 if (cpumask_test_cpu(0, mask)) {
48 local_irq_save(flags);
49 func(info);
50 local_irq_restore(flags);
51 }
52}
53EXPORT_SYMBOL(on_each_cpu_mask);
54
55/*
56 * Preemption is disabled here to make sure the cond_func is called under the
 57 * same conditions in UP and SMP.
58 */
59void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
60 smp_call_func_t func, void *info, bool wait,
61 gfp_t gfp_flags)
62{
63 unsigned long flags;
64
65 preempt_disable();
66 if (cond_func(0, info)) {
67 local_irq_save(flags);
68 func(info);
69 local_irq_restore(flags);
70 }
71 preempt_enable();
72}
73EXPORT_SYMBOL(on_each_cpu_cond);
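
With these UP stubs in place, callers can use the full on_each_cpu*() family without #ifdef CONFIG_SMP: on a uniprocessor build the "cross-call" collapses to running the function locally with interrupts disabled (and, for on_each_cpu_cond(), with preemption disabled around the predicate). A hypothetical usage sketch, assuming drain_local_cache() is some per-CPU cleanup the caller wants run everywhere:

#include <linux/smp.h>

static void drain_local_cache(void *info)
{
        /* per-CPU work: runs on every online CPU with IRQs disabled */
}

static void drain_all_caches(void)
{
        /* wait == 1: do not return until every CPU has run the function */
        on_each_cpu(drain_local_cache, NULL, 1);
}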
diff --git a/kernel/user.c b/kernel/user.c
index 69b4c3d48cde..5bbb91988e69 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,8 +51,6 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
56}; 54};
57EXPORT_SYMBOL_GPL(init_user_ns); 55EXPORT_SYMBOL_GPL(init_user_ns);
58 56
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9064b919a406..13fb1134ba58 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,8 +101,6 @@ int create_user_ns(struct cred *new)
101 101
102 set_cred_user_ns(new, ns); 102 set_cred_user_ns(new, ns);
103 103
104 update_mnt_policy(ns);
105
106 return 0; 104 return 0;
107} 105}
108 106
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2fc8576efaa8..fd393124e507 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
114 struct uts_namespace *ns = new; 114 struct uts_namespace *ns = new;
115 115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN)) 117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
118 return -EPERM; 118 return -EPERM;
119 119
120 get_uts_ns(ns); 120 get_uts_ns(ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1241d8c91d5e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
486 .unpark = watchdog_enable, 486 .unpark = watchdog_enable,
487}; 487};
488 488
489static int watchdog_enable_all_cpus(void) 489static void restart_watchdog_hrtimer(void *info)
490{
491 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
492 int ret;
493
494 /*
495 * No need to cancel and restart hrtimer if it is currently executing
496 * because it will reprogram itself with the new period now.
497 * We should never see it unqueued here because we are running per-cpu
498 * with interrupts disabled.
499 */
500 ret = hrtimer_try_to_cancel(hrtimer);
501 if (ret == 1)
502 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
503 HRTIMER_MODE_REL_PINNED);
504}
505
506static void update_timers(int cpu)
507{
508 struct call_single_data data = {.func = restart_watchdog_hrtimer};
509 /*
 510 * Make sure that the perf event counter will adapt to a new
511 * sampling period. Updating the sampling period directly would
512 * be much nicer but we do not have an API for that now so
513 * let's use a big hammer.
514 * Hrtimer will adopt the new period on the next tick but this
515 * might be late already so we have to restart the timer as well.
516 */
517 watchdog_nmi_disable(cpu);
518 __smp_call_function_single(cpu, &data, 1);
519 watchdog_nmi_enable(cpu);
520}
521
522static void update_timers_all_cpus(void)
523{
524 int cpu;
525
526 get_online_cpus();
527 preempt_disable();
528 for_each_online_cpu(cpu)
529 update_timers(cpu);
530 preempt_enable();
531 put_online_cpus();
532}
533
534static int watchdog_enable_all_cpus(bool sample_period_changed)
490{ 535{
491 int err = 0; 536 int err = 0;
492 537
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
496 pr_err("Failed to create watchdog threads, disabled\n"); 541 pr_err("Failed to create watchdog threads, disabled\n");
497 else 542 else
498 watchdog_running = 1; 543 watchdog_running = 1;
544 } else if (sample_period_changed) {
545 update_timers_all_cpus();
499 } 546 }
500 547
501 return err; 548 return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
520 void __user *buffer, size_t *lenp, loff_t *ppos) 567 void __user *buffer, size_t *lenp, loff_t *ppos)
521{ 568{
522 int err, old_thresh, old_enabled; 569 int err, old_thresh, old_enabled;
570 static DEFINE_MUTEX(watchdog_proc_mutex);
523 571
572 mutex_lock(&watchdog_proc_mutex);
524 old_thresh = ACCESS_ONCE(watchdog_thresh); 573 old_thresh = ACCESS_ONCE(watchdog_thresh);
525 old_enabled = ACCESS_ONCE(watchdog_user_enabled); 574 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
526 575
527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 576 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 if (err || !write) 577 if (err || !write)
529 return err; 578 goto out;
530 579
531 set_sample_period(); 580 set_sample_period();
532 /* 581 /*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
535 * watchdog_*_all_cpus() function takes care of this. 584 * watchdog_*_all_cpus() function takes care of this.
536 */ 585 */
537 if (watchdog_user_enabled && watchdog_thresh) 586 if (watchdog_user_enabled && watchdog_thresh)
538 err = watchdog_enable_all_cpus(); 587 err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
539 else 588 else
540 watchdog_disable_all_cpus(); 589 watchdog_disable_all_cpus();
541 590
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
544 watchdog_thresh = old_thresh; 593 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled; 594 watchdog_user_enabled = old_enabled;
546 } 595 }
547 596out:
597 mutex_unlock(&watchdog_proc_mutex);
548 return err; 598 return err;
549} 599}
550#endif /* CONFIG_SYSCTL */ 600#endif /* CONFIG_SYSCTL */
@@ -553,14 +603,6 @@ void __init lockup_detector_init(void)
553{ 603{
554 set_sample_period(); 604 set_sample_period();
555 605
556#ifdef CONFIG_NO_HZ_FULL
557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
561 }
562#endif
563
564 if (watchdog_user_enabled) 606 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus(); 607 watchdog_enable_all_cpus(false);
566} 608}
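
Two things change in watchdog.c: writes that modify the sample period now restart each CPU's hrtimer (and kick the NMI perf event) via update_timers_all_cpus(), and proc_dowatchdog() serializes concurrent writers with a function-local mutex so the read-modify-write of the saved old values cannot interleave. The serialization pattern in isolation (example_sysctl_handler() and example_value are made up for illustration; the real handler uses proc_dointvec_minmax()):

#include <linux/mutex.h>
#include <linux/sysctl.h>

static int example_value;

static int example_sysctl_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos)
{
        static DEFINE_MUTEX(handler_mutex);
        int old, err;

        mutex_lock(&handler_mutex);
        old = example_value;

        err = proc_dointvec(table, write, buffer, lenp, ppos);
        if (err || !write)
                goto out;

        if (example_value < 0)          /* reject the update and roll back */
                example_value = old;
out:
        mutex_unlock(&handler_mutex);
        return err;
}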
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7f5d4be22034..987293d03ebc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
16 * 16 *
17 * This is the generic async execution mechanism. Work items as are 17 * This is the generic async execution mechanism. Work items as are
18 * executed in process context. The worker pool is shared and 18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and 19 * automatically managed. There are two worker pools for each CPU (one for
20 * one extra for works which are better served by workers which are 20 * normal work items and the other for high priority ones) and some extra
21 * not bound to any specific CPU. 21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
22 * 23 *
23 * Please read Documentation/workqueue.txt for details. 24 * Please read Documentation/workqueue.txt for details.
24 */ 25 */
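
The updated header comment describes which backing pool serves which kind of workqueue: two per-CPU pools (normal and high priority) plus a dynamic set of pools for unbound workqueues. A hedged usage sketch of how a caller selects between them through the allocation flags (example_* names are hypothetical):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_hipri_wq;
static struct workqueue_struct *example_unbound_wq;

static int __init example_wq_init(void)
{
        /* served by the per-CPU high-priority worker pool */
        example_hipri_wq = alloc_workqueue("example_hipri", WQ_HIGHPRI, 0);
        /* served by one of the dynamically managed unbound pools */
        example_unbound_wq = alloc_workqueue("example_unbound", WQ_UNBOUND, 0);
        if (!example_hipri_wq || !example_unbound_wq)
                return -ENOMEM;
        return 0;
}
core_initcall(example_wq_init);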
@@ -540,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
540 * This must be called either with pwq_lock held or sched RCU read locked. 541 * This must be called either with pwq_lock held or sched RCU read locked.
541 * If the pwq needs to be used beyond the locking in effect, the caller is 542 * If the pwq needs to be used beyond the locking in effect, the caller is
542 * responsible for guaranteeing that the pwq stays online. 543 * responsible for guaranteeing that the pwq stays online.
544 *
545 * Return: The unbound pool_workqueue for @node.
543 */ 546 */
544static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, 547static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
545 int node) 548 int node)
@@ -638,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
638 * get_work_pool - return the worker_pool a given work was associated with 641 * get_work_pool - return the worker_pool a given work was associated with
639 * @work: the work item of interest 642 * @work: the work item of interest
640 * 643 *
641 * Return the worker_pool @work was last associated with. %NULL if none.
642 *
643 * Pools are created and destroyed under wq_pool_mutex, and allows read 644 * Pools are created and destroyed under wq_pool_mutex, and allows read
644 * access under sched-RCU read lock. As such, this function should be 645 * access under sched-RCU read lock. As such, this function should be
645 * called under wq_pool_mutex or with preemption disabled. 646 * called under wq_pool_mutex or with preemption disabled.
@@ -648,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
648 * mentioned locking is in effect. If the returned pool needs to be used 649 * mentioned locking is in effect. If the returned pool needs to be used
649 * beyond the critical section, the caller is responsible for ensuring the 650 * beyond the critical section, the caller is responsible for ensuring the
650 * returned pool is and stays online. 651 * returned pool is and stays online.
652 *
653 * Return: The worker_pool @work was last associated with. %NULL if none.
651 */ 654 */
652static struct worker_pool *get_work_pool(struct work_struct *work) 655static struct worker_pool *get_work_pool(struct work_struct *work)
653{ 656{
@@ -671,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
671 * get_work_pool_id - return the worker pool ID a given work is associated with 674 * get_work_pool_id - return the worker pool ID a given work is associated with
672 * @work: the work item of interest 675 * @work: the work item of interest
673 * 676 *
674 * Return the worker_pool ID @work was last associated with. 677 * Return: The worker_pool ID @work was last associated with.
675 * %WORK_OFFQ_POOL_NONE if none. 678 * %WORK_OFFQ_POOL_NONE if none.
676 */ 679 */
677static int get_work_pool_id(struct work_struct *work) 680static int get_work_pool_id(struct work_struct *work)
@@ -830,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
830 * CONTEXT: 833 * CONTEXT:
831 * spin_lock_irq(rq->lock) 834 * spin_lock_irq(rq->lock)
832 * 835 *
833 * RETURNS: 836 * Return:
834 * Worker task on @cpu to wake up, %NULL if none. 837 * Worker task on @cpu to wake up, %NULL if none.
835 */ 838 */
836struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) 839struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
@@ -965,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
965 * CONTEXT: 968 * CONTEXT:
966 * spin_lock_irq(pool->lock). 969 * spin_lock_irq(pool->lock).
967 * 970 *
968 * RETURNS: 971 * Return:
969 * Pointer to worker which is executing @work if found, NULL 972 * Pointer to worker which is executing @work if found, %NULL
970 * otherwise. 973 * otherwise.
971 */ 974 */
972static struct worker *find_worker_executing_work(struct worker_pool *pool, 975static struct worker *find_worker_executing_work(struct worker_pool *pool,
@@ -1154,14 +1157,16 @@ out_put:
1154 * @flags: place to store irq state 1157 * @flags: place to store irq state
1155 * 1158 *
1156 * Try to grab PENDING bit of @work. This function can handle @work in any 1159 * Try to grab PENDING bit of @work. This function can handle @work in any
1157 * stable state - idle, on timer or on worklist. Return values are 1160 * stable state - idle, on timer or on worklist.
1158 * 1161 *
1162 * Return:
1159 * 1 if @work was pending and we successfully stole PENDING 1163 * 1 if @work was pending and we successfully stole PENDING
1160 * 0 if @work was idle and we claimed PENDING 1164 * 0 if @work was idle and we claimed PENDING
1161 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry 1165 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1162 * -ENOENT if someone else is canceling @work, this state may persist 1166 * -ENOENT if someone else is canceling @work, this state may persist
1163 * for arbitrarily long 1167 * for arbitrarily long
1164 * 1168 *
1169 * Note:
1165 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 1170 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1166 * interrupted while holding PENDING and @work off queue, irq must be 1171 * interrupted while holding PENDING and @work off queue, irq must be
1167 * disabled on entry. This, combined with delayed_work->timer being 1172 * disabled on entry. This, combined with delayed_work->timer being
@@ -1403,10 +1408,10 @@ retry:
1403 * @wq: workqueue to use 1408 * @wq: workqueue to use
1404 * @work: work to queue 1409 * @work: work to queue
1405 * 1410 *
1406 * Returns %false if @work was already on a queue, %true otherwise.
1407 *
1408 * We queue the work to a specific CPU, the caller must ensure it 1411 * We queue the work to a specific CPU, the caller must ensure it
1409 * can't go away. 1412 * can't go away.
1413 *
1414 * Return: %false if @work was already on a queue, %true otherwise.
1410 */ 1415 */
1411bool queue_work_on(int cpu, struct workqueue_struct *wq, 1416bool queue_work_on(int cpu, struct workqueue_struct *wq,
1412 struct work_struct *work) 1417 struct work_struct *work)
@@ -1476,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1476 * @dwork: work to queue 1481 * @dwork: work to queue
1477 * @delay: number of jiffies to wait before queueing 1482 * @delay: number of jiffies to wait before queueing
1478 * 1483 *
1479 * Returns %false if @work was already on a queue, %true otherwise. If 1484 * Return: %false if @work was already on a queue, %true otherwise. If
1480 * @delay is zero and @dwork is idle, it will be scheduled for immediate 1485 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1481 * execution. 1486 * execution.
1482 */ 1487 */
@@ -1512,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on);
1512 * zero, @work is guaranteed to be scheduled immediately regardless of its 1517 * zero, @work is guaranteed to be scheduled immediately regardless of its
1513 * current state. 1518 * current state.
1514 * 1519 *
1515 * Returns %false if @dwork was idle and queued, %true if @dwork was 1520 * Return: %false if @dwork was idle and queued, %true if @dwork was
1516 * pending and its timer was modified. 1521 * pending and its timer was modified.
1517 * 1522 *
1518 * This function is safe to call from any context including IRQ handler. 1523 * This function is safe to call from any context including IRQ handler.
@@ -1627,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker)
1627 * Might sleep. Called without any lock but returns with pool->lock 1632 * Might sleep. Called without any lock but returns with pool->lock
1628 * held. 1633 * held.
1629 * 1634 *
1630 * RETURNS: 1635 * Return:
1631 * %true if the associated pool is online (@worker is successfully 1636 * %true if the associated pool is online (@worker is successfully
1632 * bound), %false if offline. 1637 * bound), %false if offline.
1633 */ 1638 */
@@ -1688,7 +1693,7 @@ static struct worker *alloc_worker(void)
1688 * CONTEXT: 1693 * CONTEXT:
1689 * Might sleep. Does GFP_KERNEL allocations. 1694 * Might sleep. Does GFP_KERNEL allocations.
1690 * 1695 *
1691 * RETURNS: 1696 * Return:
1692 * Pointer to the newly created worker. 1697 * Pointer to the newly created worker.
1693 */ 1698 */
1694static struct worker *create_worker(struct worker_pool *pool) 1699static struct worker *create_worker(struct worker_pool *pool)
@@ -1788,6 +1793,8 @@ static void start_worker(struct worker *worker)
1788 * @pool: the target pool 1793 * @pool: the target pool
1789 * 1794 *
1790 * Grab the managership of @pool and create and start a new worker for it. 1795 * Grab the managership of @pool and create and start a new worker for it.
1796 *
1797 * Return: 0 on success. A negative error code otherwise.
1791 */ 1798 */
1792static int create_and_start_worker(struct worker_pool *pool) 1799static int create_and_start_worker(struct worker_pool *pool)
1793{ 1800{
@@ -1932,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1932 * multiple times. Does GFP_KERNEL allocations. Called only from 1939 * multiple times. Does GFP_KERNEL allocations. Called only from
1933 * manager. 1940 * manager.
1934 * 1941 *
1935 * RETURNS: 1942 * Return:
1936 * %false if no action was taken and pool->lock stayed locked, %true 1943 * %false if no action was taken and pool->lock stayed locked, %true
1937 * otherwise. 1944 * otherwise.
1938 */ 1945 */
@@ -1989,7 +1996,7 @@ restart:
1989 * spin_lock_irq(pool->lock) which may be released and regrabbed 1996 * spin_lock_irq(pool->lock) which may be released and regrabbed
1990 * multiple times. Called only from manager. 1997 * multiple times. Called only from manager.
1991 * 1998 *
1992 * RETURNS: 1999 * Return:
1993 * %false if no action was taken and pool->lock stayed locked, %true 2000 * %false if no action was taken and pool->lock stayed locked, %true
1994 * otherwise. 2001 * otherwise.
1995 */ 2002 */
@@ -2032,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2032 * spin_lock_irq(pool->lock) which may be released and regrabbed 2039 * spin_lock_irq(pool->lock) which may be released and regrabbed
2033 * multiple times. Does GFP_KERNEL allocations. 2040 * multiple times. Does GFP_KERNEL allocations.
2034 * 2041 *
2035 * RETURNS: 2042 * Return:
2036 * spin_lock_irq(pool->lock) which may be released and regrabbed 2043 * %false if the pool don't need management and the caller can safely start
2037 * multiple times. Does GFP_KERNEL allocations. 2044 * processing works, %true indicates that the function released pool->lock
2045 * and reacquired it to perform some management function and that the
2046 * conditions that the caller verified while holding the lock before
2047 * calling the function might no longer be true.
2038 */ 2048 */
2039static bool manage_workers(struct worker *worker) 2049static bool manage_workers(struct worker *worker)
2040{ 2050{
@@ -2201,6 +2211,15 @@ __acquires(&pool->lock)
2201 dump_stack(); 2211 dump_stack();
2202 } 2212 }
2203 2213
2214 /*
2215 * The following prevents a kworker from hogging CPU on !PREEMPT
2216 * kernels, where a requeueing work item waiting for something to
2217 * happen could deadlock with stop_machine as such work item could
2218 * indefinitely requeue itself while all other CPUs are trapped in
2219 * stop_machine.
2220 */
2221 cond_resched();
2222
2204 spin_lock_irq(&pool->lock); 2223 spin_lock_irq(&pool->lock);
2205 2224
2206 /* clear cpu intensive status */ 2225 /* clear cpu intensive status */
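
The cond_resched() added to process_one_work() targets work items like the hypothetical sketch below: on a !PREEMPT kernel a self-requeueing work item polling for some external event could keep its kworker on the CPU indefinitely and deadlock against stop_machine. requeue_fn() and example_condition_met() are made up purely for illustration.

#include <linux/workqueue.h>

static void requeue_fn(struct work_struct *work);
static DECLARE_WORK(requeue_work, requeue_fn);

static bool example_condition_met(void)
{
        return false;   /* stand-in for whatever the work item is polling */
}

static void requeue_fn(struct work_struct *work)
{
        /*
         * Re-queue until the condition holds.  Without the cond_resched()
         * in process_one_work(), a loop like this could hog its CPU on
         * non-preemptible kernels between iterations.
         */
        if (!example_condition_met())
                schedule_work(&requeue_work);
}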
@@ -2246,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker)
2246 * work items regardless of their specific target workqueue. The only 2265 * work items regardless of their specific target workqueue. The only
2247 * exception is work items which belong to workqueues with a rescuer which 2266 * exception is work items which belong to workqueues with a rescuer which
2248 * will be explained in rescuer_thread(). 2267 * will be explained in rescuer_thread().
2268 *
2269 * Return: 0
2249 */ 2270 */
2250static int worker_thread(void *__worker) 2271static int worker_thread(void *__worker)
2251{ 2272{
@@ -2344,6 +2365,8 @@ sleep:
2344 * those works so that forward progress can be guaranteed. 2365 * those works so that forward progress can be guaranteed.
2345 * 2366 *
2346 * This should happen rarely. 2367 * This should happen rarely.
2368 *
2369 * Return: 0
2347 */ 2370 */
2348static int rescuer_thread(void *__rescuer) 2371static int rescuer_thread(void *__rescuer)
2349{ 2372{
@@ -2516,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2516 * CONTEXT: 2539 * CONTEXT:
2517 * mutex_lock(wq->mutex). 2540 * mutex_lock(wq->mutex).
2518 * 2541 *
2519 * RETURNS: 2542 * Return:
2520 * %true if @flush_color >= 0 and there's something to flush. %false 2543 * %true if @flush_color >= 0 and there's something to flush. %false
2521 * otherwise. 2544 * otherwise.
2522 */ 2545 */
@@ -2837,7 +2860,7 @@ static bool __flush_work(struct work_struct *work)
2837 * Wait until @work has finished execution. @work is guaranteed to be idle 2860 * Wait until @work has finished execution. @work is guaranteed to be idle
2838 * on return if it hasn't been requeued since flush started. 2861 * on return if it hasn't been requeued since flush started.
2839 * 2862 *
2840 * RETURNS: 2863 * Return:
2841 * %true if flush_work() waited for the work to finish execution, 2864 * %true if flush_work() waited for the work to finish execution,
2842 * %false if it was already idle. 2865 * %false if it was already idle.
2843 */ 2866 */
@@ -2889,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2889 * The caller must ensure that the workqueue on which @work was last 2912 * The caller must ensure that the workqueue on which @work was last
2890 * queued can't be destroyed before this function returns. 2913 * queued can't be destroyed before this function returns.
2891 * 2914 *
2892 * RETURNS: 2915 * Return:
2893 * %true if @work was pending, %false otherwise. 2916 * %true if @work was pending, %false otherwise.
2894 */ 2917 */
2895bool cancel_work_sync(struct work_struct *work) 2918bool cancel_work_sync(struct work_struct *work)
@@ -2906,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2906 * immediate execution. Like flush_work(), this function only 2929 * immediate execution. Like flush_work(), this function only
2907 * considers the last queueing instance of @dwork. 2930 * considers the last queueing instance of @dwork.
2908 * 2931 *
2909 * RETURNS: 2932 * Return:
2910 * %true if flush_work() waited for the work to finish execution, 2933 * %true if flush_work() waited for the work to finish execution,
2911 * %false if it was already idle. 2934 * %false if it was already idle.
2912 */ 2935 */
@@ -2924,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work);
2924 * cancel_delayed_work - cancel a delayed work 2947 * cancel_delayed_work - cancel a delayed work
2925 * @dwork: delayed_work to cancel 2948 * @dwork: delayed_work to cancel
2926 * 2949 *
2927 * Kill off a pending delayed_work. Returns %true if @dwork was pending 2950 * Kill off a pending delayed_work.
2928 * and canceled; %false if wasn't pending. Note that the work callback 2951 *
2929 * function may still be running on return, unless it returns %true and the 2952 * Return: %true if @dwork was pending and canceled; %false if it wasn't
2930 * work doesn't re-arm itself. Explicitly flush or use 2953 * pending.
2931 * cancel_delayed_work_sync() to wait on it. 2954 *
2955 * Note:
2956 * The work callback function may still be running on return, unless
2957 * it returns %true and the work doesn't re-arm itself. Explicitly flush or
2958 * use cancel_delayed_work_sync() to wait on it.
2932 * 2959 *
2933 * This function is safe to call from any context including IRQ handler. 2960 * This function is safe to call from any context including IRQ handler.
2934 */ 2961 */
@@ -2957,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work);
2957 * 2984 *
2958 * This is cancel_work_sync() for delayed works. 2985 * This is cancel_work_sync() for delayed works.
2959 * 2986 *
2960 * RETURNS: 2987 * Return:
2961 * %true if @dwork was pending, %false otherwise. 2988 * %true if @dwork was pending, %false otherwise.
2962 */ 2989 */
2963bool cancel_delayed_work_sync(struct delayed_work *dwork) 2990bool cancel_delayed_work_sync(struct delayed_work *dwork)
@@ -2974,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
2974 * system workqueue and blocks until all CPUs have completed. 3001 * system workqueue and blocks until all CPUs have completed.
2975 * schedule_on_each_cpu() is very slow. 3002 * schedule_on_each_cpu() is very slow.
2976 * 3003 *
2977 * RETURNS: 3004 * Return:
2978 * 0 on success, -errno on failure. 3005 * 0 on success, -errno on failure.
2979 */ 3006 */
2980int schedule_on_each_cpu(work_func_t func) 3007int schedule_on_each_cpu(work_func_t func)
@@ -3042,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work);
3042 * Executes the function immediately if process context is available, 3069 * Executes the function immediately if process context is available,
3043 * otherwise schedules the function for delayed execution. 3070 * otherwise schedules the function for delayed execution.
3044 * 3071 *
3045 * Returns: 0 - function was executed 3072 * Return: 0 - function was executed
3046 * 1 - function was scheduled for execution 3073 * 1 - function was scheduled for execution
3047 */ 3074 */
3048int execute_in_process_context(work_func_t fn, struct execute_work *ew) 3075int execute_in_process_context(work_func_t fn, struct execute_work *ew)
@@ -3086,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
3086 return wq_dev->wq; 3113 return wq_dev->wq;
3087} 3114}
3088 3115
3089static ssize_t wq_per_cpu_show(struct device *dev, 3116static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3090 struct device_attribute *attr, char *buf) 3117 char *buf)
3091{ 3118{
3092 struct workqueue_struct *wq = dev_to_wq(dev); 3119 struct workqueue_struct *wq = dev_to_wq(dev);
3093 3120
3094 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 3121 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3095} 3122}
3123static DEVICE_ATTR_RO(per_cpu);
3096 3124
3097static ssize_t wq_max_active_show(struct device *dev, 3125static ssize_t max_active_show(struct device *dev,
3098 struct device_attribute *attr, char *buf) 3126 struct device_attribute *attr, char *buf)
3099{ 3127{
3100 struct workqueue_struct *wq = dev_to_wq(dev); 3128 struct workqueue_struct *wq = dev_to_wq(dev);
3101 3129
3102 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 3130 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3103} 3131}
3104 3132
3105static ssize_t wq_max_active_store(struct device *dev, 3133static ssize_t max_active_store(struct device *dev,
3106 struct device_attribute *attr, 3134 struct device_attribute *attr, const char *buf,
3107 const char *buf, size_t count) 3135 size_t count)
3108{ 3136{
3109 struct workqueue_struct *wq = dev_to_wq(dev); 3137 struct workqueue_struct *wq = dev_to_wq(dev);
3110 int val; 3138 int val;
@@ -3115,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev,
3115 workqueue_set_max_active(wq, val); 3143 workqueue_set_max_active(wq, val);
3116 return count; 3144 return count;
3117} 3145}
3146static DEVICE_ATTR_RW(max_active);
3118 3147
3119static struct device_attribute wq_sysfs_attrs[] = { 3148static struct attribute *wq_sysfs_attrs[] = {
3120 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), 3149 &dev_attr_per_cpu.attr,
3121 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), 3150 &dev_attr_max_active.attr,
3122 __ATTR_NULL, 3151 NULL,
3123}; 3152};
3153ATTRIBUTE_GROUPS(wq_sysfs);
3124 3154
3125static ssize_t wq_pool_ids_show(struct device *dev, 3155static ssize_t wq_pool_ids_show(struct device *dev,
3126 struct device_attribute *attr, char *buf) 3156 struct device_attribute *attr, char *buf)
@@ -3270,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
3270 3300
3271static struct bus_type wq_subsys = { 3301static struct bus_type wq_subsys = {
3272 .name = "workqueue", 3302 .name = "workqueue",
3273 .dev_attrs = wq_sysfs_attrs, 3303 .dev_groups = wq_sysfs_groups,
3274}; 3304};
3275 3305
3276static int __init wq_sysfs_init(void) 3306static int __init wq_sysfs_init(void)
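
The sysfs hunks convert the workqueue attributes to the DEVICE_ATTR_RO()/DEVICE_ATTR_RW() + ATTRIBUTE_GROUPS() convention and switch the bus from the removed .dev_attrs field to .dev_groups. The macros derive dev_attr_<name> from <name>_show()/<name>_store(), and ATTRIBUTE_GROUPS(foo) builds foo_groups out of foo_attrs[]. A minimal, hypothetical "example" attribute showing the same shape (not taken from the patch):

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static int example_state;

static ssize_t example_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        return scnprintf(buf, PAGE_SIZE, "%d\n", example_state);
}

static ssize_t example_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        if (kstrtoint(buf, 0, &example_state))
                return -EINVAL;
        return count;
}
static DEVICE_ATTR_RW(example);         /* pairs with example_show()/example_store() */

static struct attribute *example_attrs[] = {
        &dev_attr_example.attr,
        NULL,
};
ATTRIBUTE_GROUPS(example);              /* generates example_groups for .dev_groups */

static struct bus_type example_subsys = {
        .name           = "example",
        .dev_groups     = example_groups,
};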
@@ -3299,7 +3329,7 @@ static void wq_device_release(struct device *dev)
3299 * apply_workqueue_attrs() may race against userland updating the 3329 * apply_workqueue_attrs() may race against userland updating the
3300 * attributes. 3330 * attributes.
3301 * 3331 *
3302 * Returns 0 on success, -errno on failure. 3332 * Return: 0 on success, -errno on failure.
3303 */ 3333 */
3304int workqueue_sysfs_register(struct workqueue_struct *wq) 3334int workqueue_sysfs_register(struct workqueue_struct *wq)
3305{ 3335{
@@ -3392,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
3392 * @gfp_mask: allocation mask to use 3422 * @gfp_mask: allocation mask to use
3393 * 3423 *
3394 * Allocate a new workqueue_attrs, initialize with default settings and 3424 * Allocate a new workqueue_attrs, initialize with default settings and
3395 * return it. Returns NULL on failure. 3425 * return it.
3426 *
 3427 * Return: The allocated new workqueue_attrs on success. %NULL on failure.
3396 */ 3428 */
3397struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) 3429struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3398{ 3430{
@@ -3451,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
3451 * @pool: worker_pool to initialize 3483 * @pool: worker_pool to initialize
3452 * 3484 *
3453 * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. 3485 * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
3454 * Returns 0 on success, -errno on failure. Even on failure, all fields 3486 *
3487 * Return: 0 on success, -errno on failure. Even on failure, all fields
3455 * inside @pool proper are initialized and put_unbound_pool() can be called 3488 * inside @pool proper are initialized and put_unbound_pool() can be called
3456 * on @pool safely to release it. 3489 * on @pool safely to release it.
3457 */ 3490 */
@@ -3558,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool)
3558 * Obtain a worker_pool which has the same attributes as @attrs, bump the 3591 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3559 * reference count and return it. If there already is a matching 3592 * reference count and return it. If there already is a matching
3560 * worker_pool, it will be used; otherwise, this function attempts to 3593 * worker_pool, it will be used; otherwise, this function attempts to
3561 * create a new one. On failure, returns NULL. 3594 * create a new one.
3562 * 3595 *
3563 * Should be called with wq_pool_mutex held. 3596 * Should be called with wq_pool_mutex held.
3597 *
3598 * Return: On success, a worker_pool with the same attributes as @attrs.
3599 * On failure, %NULL.
3564 */ 3600 */
3565static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 3601static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3566{ 3602{
@@ -3796,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3796 * 3832 *
3797 * Calculate the cpumask a workqueue with @attrs should use on @node. If 3833 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3798 * @cpu_going_down is >= 0, that cpu is considered offline during 3834 * @cpu_going_down is >= 0, that cpu is considered offline during
3799 * calculation. The result is stored in @cpumask. This function returns 3835 * calculation. The result is stored in @cpumask.
3800 * %true if the resulting @cpumask is different from @attrs->cpumask,
3801 * %false if equal.
3802 * 3836 *
3803 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If 3837 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3804 * enabled and @node has online CPUs requested by @attrs, the returned 3838 * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3807,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3807 * 3841 *
3808 * The caller is responsible for ensuring that the cpumask of @node stays 3842 * The caller is responsible for ensuring that the cpumask of @node stays
3809 * stable. 3843 * stable.
3844 *
3845 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
3846 * %false if equal.
3810 */ 3847 */
3811static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, 3848static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3812 int cpu_going_down, cpumask_t *cpumask) 3849 int cpu_going_down, cpumask_t *cpumask)
@@ -3860,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3860 * items finish. Note that a work item which repeatedly requeues itself 3897 * items finish. Note that a work item which repeatedly requeues itself
3861 * back-to-back will stay on its current pwq. 3898 * back-to-back will stay on its current pwq.
3862 * 3899 *
3863 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on 3900 * Performs GFP_KERNEL allocations.
3864 * failure. 3901 *
3902 * Return: 0 on success and -errno on failure.
3865 */ 3903 */
3866int apply_workqueue_attrs(struct workqueue_struct *wq, 3904int apply_workqueue_attrs(struct workqueue_struct *wq,
3867 const struct workqueue_attrs *attrs) 3905 const struct workqueue_attrs *attrs)
@@ -4329,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
4329 * 4367 *
4330 * Determine whether %current is a workqueue rescuer. Can be used from 4368 * Determine whether %current is a workqueue rescuer. Can be used from
4331 * work functions to determine whether it's being run off the rescuer task. 4369 * work functions to determine whether it's being run off the rescuer task.
4370 *
4371 * Return: %true if %current is a workqueue rescuer. %false otherwise.
4332 */ 4372 */
4333bool current_is_workqueue_rescuer(void) 4373bool current_is_workqueue_rescuer(void)
4334{ 4374{
@@ -4352,7 +4392,7 @@ bool current_is_workqueue_rescuer(void)
4352 * workqueue being congested on one CPU doesn't mean the workqueue is also 4392 * workqueue being congested on one CPU doesn't mean the workqueue is also
 4353 * congested on other CPUs / NUMA nodes. 4393 * congested on other CPUs / NUMA nodes.
4354 * 4394 *
4355 * RETURNS: 4395 * Return:
4356 * %true if congested, %false otherwise. 4396 * %true if congested, %false otherwise.
4357 */ 4397 */
4358bool workqueue_congested(int cpu, struct workqueue_struct *wq) 4398bool workqueue_congested(int cpu, struct workqueue_struct *wq)
@@ -4385,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
4385 * synchronization around this function and the test result is 4425 * synchronization around this function and the test result is
4386 * unreliable and only useful as advisory hints or for debugging. 4426 * unreliable and only useful as advisory hints or for debugging.
4387 * 4427 *
4388 * RETURNS: 4428 * Return:
4389 * OR'd bitmask of WORK_BUSY_* bits. 4429 * OR'd bitmask of WORK_BUSY_* bits.
4390 */ 4430 */
4391unsigned int work_busy(struct work_struct *work) 4431unsigned int work_busy(struct work_struct *work)
@@ -4763,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work)
4763 * @fn: the function to run 4803 * @fn: the function to run
4764 * @arg: the function arg 4804 * @arg: the function arg
4765 * 4805 *
4766 * This will return the value @fn returns.
4767 * It is up to the caller to ensure that the cpu doesn't go offline. 4806 * It is up to the caller to ensure that the cpu doesn't go offline.
4768 * The caller must not hold any locks which would prevent @fn from completing. 4807 * The caller must not hold any locks which would prevent @fn from completing.
4808 *
4809 * Return: The value @fn returns.
4769 */ 4810 */
4770long work_on_cpu(int cpu, long (*fn)(void *), void *arg) 4811long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4771{ 4812{
@@ -4837,7 +4878,7 @@ void freeze_workqueues_begin(void)
4837 * CONTEXT: 4878 * CONTEXT:
4838 * Grabs and releases wq_pool_mutex. 4879 * Grabs and releases wq_pool_mutex.
4839 * 4880 *
4840 * RETURNS: 4881 * Return:
4841 * %true if some freezable workqueues are still busy. %false if freezing 4882 * %true if some freezable workqueues are still busy. %false if freezing
4842 * is complete. 4883 * is complete.
4843 */ 4884 */