author     Linus Torvalds <torvalds@linux-foundation.org>  2012-07-26 16:08:01 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-26 16:08:01 -0400
commit     79071638ce655c1f78a50d05c7dae0ad04a3e92a (patch)
tree       d9e76997c418b78a2485ac50d5970f7d420a5600
parent     44a6b8442190cf213081060b610dae2e822f802b (diff)
parent     8323f26ce3425460769605a6aece7a174edaa7d1 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1     2     4     8     16    32    64    128
  |...........................................................
  | pre        30    41   118   645   3769  6214 12233  14312
  | post      299   603  1211  2418   4697  6847 11606  14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and
  various smaller tweaks."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix race in task_group()
  sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
  sched: Reset loop counters if all tasks are pinned and we need to redo load balance
  sched: Reorder 'struct lb_env' members to reduce its size
  sched: Improve scalability via 'CPU buddies', which withstand random perturbations
  cpusets: Remove/update outdated comments
  cpusets, hotplug: Restructure functions that are invoked during hotplug
  cpusets, hotplug: Implement cpuset tree traversal in a helper function
  CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
  sched/x86: Remove broken power estimation
-rw-r--r--  arch/x86/kernel/cpu/Makefile    2
-rw-r--r--  arch/x86/kernel/cpu/sched.c    55
-rw-r--r--  include/linux/cpuset.h          4
-rw-r--r--  include/linux/init_task.h      12
-rw-r--r--  include/linux/sched.h           6
-rw-r--r--  kernel/cpuset.c               130
-rw-r--r--  kernel/sched/core.c            92
-rw-r--r--  kernel/sched/fair.c           113
-rw-r--r--  kernel/sched/sched.h           23
9 files changed, 291 insertions(+), 146 deletions(-)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index bac4c3804cc7..d30a6a9a0121 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
 
 obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
+obj-y			+= vmware.o hypervisor.o mshyperv.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
 
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-	struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-	unsigned long ratio, flags;
-
-	local_irq_save(flags);
-	get_aperfmperf(&val);
-	local_irq_restore(flags);
-
-	ratio = calc_aperfmperf_ratio(old, &val);
-	*old = val;
-
-	return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-	/*
-	 * do aperf/mperf on the cpu level because it includes things
-	 * like turbo mode, which are relevant to full cores.
-	 */
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		return scale_aperfmperf();
-
-	/*
-	 * maybe have something cpufreq here
-	 */
-
-	return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-	/*
-	 * aperf/mperf already includes the smt gain
-	 */
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		return SCHED_LOAD_SCALE;
-
-	return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 668f66baac7b..838320fc3d1d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,7 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8a7476186990..89f1cb1056f0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -123,8 +123,17 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+extern struct task_group root_task_group;
+
+#ifdef CONFIG_CGROUP_SCHED
+# define INIT_CGROUP_SCHED(tsk)					\
+	.sched_task_group = &root_task_group,
+#else
+# define INIT_CGROUP_SCHED(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)					\
 	.perf_event_mutex =					\
		 __MUTEX_INITIALIZER(tsk.perf_event_mutex),	\
 	.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
@@ -161,6 +170,7 @@ extern struct cred init_cred;
 	},							\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),		\
 	INIT_PUSHABLE_TASKS(tsk)				\
+	INIT_CGROUP_SCHED(tsk)					\
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),		\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),	\
 	.real_parent	= &tsk,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1a2ebd39b800..a721cef7e2d4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -949,6 +949,7 @@ struct sched_domain {
 	unsigned int smt_gain;
 	int flags;			/* See SD_* */
 	int level;
+	int idle_buddy;			/* cpu assigned to select_idle_sibling() */
 
 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */
@@ -1244,6 +1245,9 @@ struct task_struct {
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#ifdef CONFIG_CGROUP_SCHED
+	struct task_group *sched_task_group;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
@@ -2721,7 +2725,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
-#endif
+#endif /* CONFIG_CGROUP_SCHED */
 
 extern int task_can_switch_user(struct user_struct *up,
 					struct task_struct *tsk);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+	CPUSET_CPU_OFFLINE,
+	CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+	struct cpuset *cp;
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct cgroup *cont;
+
+	if (list_empty(queue))
+		return NULL;
+
+	cp = list_first_entry(queue, struct cpuset, stack_list);
+	list_del(queue->next);
+	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+		child = cgroup_cs(cont);
+		list_add_tail(&child->stack_list, queue);
+	}
+
+	return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held.  We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'.  But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;		/* scans cpusets being updated */
-	struct cpuset *child;		/* scans child cpusets of cp */
-	struct cgroup *cont;
 	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	while (!list_empty(&queue)) {
-		cp = list_first_entry(&queue, struct cpuset, stack_list);
-		list_del(queue.next);
-		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-			child = cgroup_cs(cont);
-			list_add_tail(&child->stack_list, &queue);
+	switch (event) {
+	case CPUSET_CPU_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
+
+			/* Continue past cpusets with all cpus online */
+			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+				continue;
+
+			/* Remove offline cpus from this cpuset. */
+			mutex_lock(&callback_mutex);
+			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+							cpu_active_mask);
+			mutex_unlock(&callback_mutex);
+
+			/* Move tasks from the empty cpuset to a parent */
+			if (cpumask_empty(cp->cpus_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_cpumask(cp, NULL);
 		}
+		break;
 
-		/* Continue past cpusets with all cpus, mems online */
-		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-			continue;
+	case CPUSET_MEM_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
 
-		oldmems = cp->mems_allowed;
+			/* Continue past cpusets with all mems online */
+			if (nodes_subset(cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]))
+				continue;
 
-		/* Remove offline cpus and mems from this cpuset. */
-		mutex_lock(&callback_mutex);
-		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_active_mask);
+			oldmems = cp->mems_allowed;
+
+			/* Remove offline mems from this cpuset. */
+			mutex_lock(&callback_mutex);
 			nodes_and(cp->mems_allowed, cp->mems_allowed,
 						node_states[N_HIGH_MEMORY]);
 			mutex_unlock(&callback_mutex);
 
 			/* Move tasks from the empty cpuset to a parent */
-		if (cpumask_empty(cp->cpus_allowed) ||
-		    nodes_empty(cp->mems_allowed))
-			remove_tasks_in_empty_cpuset(cp);
-		else {
-			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems, NULL);
+			if (nodes_empty(cp->mems_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
-	scan_for_empty_cpusets(&top_cpuset);
+
+	if (!cpu_online)
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	case MEM_OFFLINE:
 		/*
 		 * needn't update top_cpuset.mems_allowed explicitly because
-		 * scan_for_empty_cpusets() will update it.
+		 * scan_cpusets_upon_hotplug() will update it.
 		 */
-		scan_for_empty_cpusets(&top_cpuset);
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
 		break;
 	default:
 		break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..5d011ef4c0df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
  *
  * sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
  *
  * Furthermore, all task_rq users should acquire both locks, see
  * task_rq_lock().
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
+		struct sched_domain *tmp = sd;
+		struct sched_group *sg, *prev;
+		bool right;
+
+		/*
+		 * Traverse to first CPU in group, and count hops
+		 * to cpu from there, switching direction on each
+		 * hop, never ever pointing the last CPU rightward.
+		 */
+		do {
+			id = cpumask_first(sched_domain_span(tmp));
+			prev = sg = tmp->groups;
+			right = 1;
+
+			while (cpumask_first(sched_group_cpus(sg)) != id)
+				sg = sg->next;
+
+			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+				prev = sg;
+				sg = sg->next;
+				right = !right;
+			}
+
+			/* A CPU went down, never point back to domain start. */
+			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+				right = false;
+
+			sg = right ? sg->next : prev;
+			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+		} while ((tmp = tmp->child));
+
 		id = cpumask_first(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
-	switch (action & ~CPU_TASKS_FROZEN) {
+	switch (action) {
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED_FROZEN:
+
+		/*
+		 * num_cpus_frozen tracks how many CPUs are involved in suspend
+		 * resume sequence. As long as this is not the last online
+		 * operation in the resume sequence, just build a single sched
+		 * domain, ignoring cpusets.
+		 */
+		num_cpus_frozen--;
+		if (likely(num_cpus_frozen)) {
+			partition_sched_domains(1, NULL, NULL);
+			break;
+		}
+
+		/*
+		 * This is the last CPU online operation. So fall through and
+		 * restore the original sched domains by considering the
+		 * cpuset configurations.
+		 */
+
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		cpuset_update_active_cpus();
-		return NOTIFY_OK;
+		cpuset_update_active_cpus(true);
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
+	return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
-	switch (action & ~CPU_TASKS_FROZEN) {
+	switch (action) {
 	case CPU_DOWN_PREPARE:
-		cpuset_update_active_cpus();
-		return NOTIFY_OK;
+		cpuset_update_active_cpus(false);
+		break;
+	case CPU_DOWN_PREPARE_FROZEN:
+		num_cpus_frozen++;
+		partition_sched_domains(1, NULL, NULL);
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
+	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+	struct task_group *tg;
 	int on_rq, running;
 	unsigned long flags;
 	struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
+	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+				lockdep_is_held(&tsk->sighand->siglock)),
+			  struct task_group, css);
+	tg = autogroup_task_group(tsk, tg);
+	tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
 		tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
 
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
 	}
-done:
+
 	return target;
 }
 
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
+#define LBF_SOME_PINNED	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
-	int			src_cpu;
 	struct rq		*src_rq;
+	int			src_cpu;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		int new_dst_cpu;
+
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+		/*
+		 * Remember if this task can be migrated to any other cpu in
+		 * our sched_group. We may want to revisit it if we couldn't
+		 * meet load balance goals by pulling other tasks on src_cpu.
+		 *
+		 * Also avoid computing new_dst_cpu if we have already computed
+		 * one in current iteration.
+		 */
+		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+			return 0;
+
+		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+						tsk_cpus_allowed(p));
+		if (new_dst_cpu < nr_cpu_ids) {
+			env->flags |= LBF_SOME_PINNED;
+			env->new_dst_cpu = new_dst_cpu;
+		}
 		return 0;
 	}
+
+	/* Record that we found atleast one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, active_balance = 0;
+	int ld_moved, cur_ld_moved, active_balance = 0;
+	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
+		.dst_grpmask	= sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
+	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
@@ -4267,6 +4281,7 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
+	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
 		double_rq_lock(this_rq, busiest);
 		if (!env.loop)
 			update_h_load(env.src_cpu);
-		ld_moved += move_tasks(&env);
+
+		/*
+		 * cur_ld_moved - load moved in current iteration
+		 * ld_moved    - cumulative load moved across iterations
+		 */
+		cur_ld_moved = move_tasks(&env);
+		ld_moved += cur_ld_moved;
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4296,14 +4317,52 @@ more_balance:
 		/*
 		 * some other cpu did the load balance for us.
 		 */
-		if (ld_moved && this_cpu != smp_processor_id())
-			resched_cpu(this_cpu);
+		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+			resched_cpu(env.dst_cpu);
+
+		/*
+		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+		 * us and move them to an alternate dst_cpu in our sched_group
+		 * where they can run. The upper limit on how many times we
+		 * iterate on same src_cpu is dependent on number of cpus in our
+		 * sched_group.
+		 *
+		 * This changes load balance semantics a bit on who can move
+		 * load to a given_cpu. In addition to the given_cpu itself
+		 * (or a ilb_cpu acting on its behalf where given_cpu is
+		 * nohz-idle), we now have balance_cpu in a position to move
+		 * load to given_cpu. In rare situations, this may cause
+		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+		 * _independently_ and at _same_ time to move some load to
+		 * given_cpu) causing exceess load to be moved to given_cpu.
+		 * This however should not happen so much in practice and
+		 * moreover subsequent load balance cycles should correct the
+		 * excess load moved.
+		 */
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+				lb_iterations++ < max_lb_iterations) {
+
+			this_rq		 = cpu_rq(env.new_dst_cpu);
+			env.dst_rq	 = this_rq;
+			env.dst_cpu	 = env.new_dst_cpu;
+			env.flags	&= ~LBF_SOME_PINNED;
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			/*
+			 * Go back to "more_balance" rather than "redo" since we
+			 * need to continue with same src_cpu.
+			 */
+			goto more_balance;
+		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus))
+			if (!cpumask_empty(cpus)) {
+				env.loop = 0;
+				env.loop_break = sched_nr_migrate_break;
 				goto redo;
+			}
 			goto out_balanced;
 		}
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-	struct task_group *tg;
-	struct cgroup_subsys_state *css;
-
-	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&p->pi_lock) ||
-			lockdep_is_held(&task_rq(p)->lock));
-	tg = container_of(css, struct task_group, css);
-
-	return autogroup_task_group(p, tg);
+	return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */