path: root/kernel
author     Linus Torvalds <torvalds@linux-foundation.org>  2012-07-26 16:08:01 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-26 16:08:01 -0400
commit     79071638ce655c1f78a50d05c7dae0ad04a3e92a (patch)
tree       d9e76997c418b78a2485ac50d5970f7d420a5600 /kernel
parent     44a6b8442190cf213081060b610dae2e822f802b (diff)
parent     8323f26ce3425460769605a6aece7a174edaa7d1 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1       2       4        8       16       32       64      128
  |..........................................................................
  | pre        30      41     118      645     3769     6214    12233    14312
  | post      299     603    1211     2418     4697     6847    11606    14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and various
  smaller tweaks."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix race in task_group()
  sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
  sched: Reset loop counters if all tasks are pinned and we need to redo load balance
  sched: Reorder 'struct lb_env' members to reduce its size
  sched: Improve scalability via 'CPU buddies', which withstand random perturbations
  cpusets: Remove/update outdated comments
  cpusets, hotplug: Restructure functions that are invoked during hotplug
  cpusets, hotplug: Implement cpuset tree traversal in a helper function
  CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
  sched/x86: Remove broken power estimation
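As a rough illustration of the 'CPU buddies' change pulled in above: instead of scanning every CPU in the last-level-cache domain on each wakeup, each domain level now carries a precomputed buddy CPU (sd->idle_buddy in the core.c and fair.c hunks below), and select_idle_sibling() only has to test those buddies. The stand-alone C sketch below mimics that lookup in user space; NR_CPUS, NR_LEVELS, the idle[] array and the buddy wiring are invented for the demo and are not the kernel's data structures.

/*
 * User-space sketch (not kernel code) of the buddy-lookup fast path.
 * Only the names idle_buddy and select_idle_sibling() come from the
 * patch; everything else here is an assumption made for the demo.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   8
#define NR_LEVELS 3                         /* e.g. SMT, MC, package */

static int  idle_buddy[NR_LEVELS][NR_CPUS]; /* precomputed per level */
static bool idle[NR_CPUS];                  /* which CPUs are idle now */

/* Wakeup fast path: O(levels) buddy checks instead of scanning all CPUs. */
static int select_idle_sibling(int target)
{
	for (int level = 0; level < NR_LEVELS; level++) {
		int buddy = idle_buddy[level][target];

		if (idle[buddy])
			return buddy;
	}
	return target;                      /* nothing idle nearby, stay put */
}

int main(void)
{
	/* Toy wiring: the buddy at each level is a fixed-stride neighbour. */
	for (int level = 0; level < NR_LEVELS; level++)
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			idle_buddy[level][cpu] = (cpu + (1 << level)) % NR_CPUS;

	idle[5] = true;
	printf("wake near cpu 4 -> cpu %d\n", select_idle_sibling(4));
	return 0;
}

The point of precomputing the buddies (done once per domain rebuild in update_top_cache_domain() in the core.c hunk) is that the wakeup path becomes a handful of per-level checks rather than a walk over every CPU in the LLC domain.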
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c      | 130
-rw-r--r--  kernel/sched/core.c  |  92
-rw-r--r--  kernel/sched/fair.c  | 113
-rw-r--r--  kernel/sched/sched.h |  23
4 files changed, 272 insertions(+), 86 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+	CPUSET_CPU_OFFLINE,
+	CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+	struct cpuset *cp;
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct cgroup *cont;
+
+	if (list_empty(queue))
+		return NULL;
+
+	cp = list_first_entry(queue, struct cpuset, stack_list);
+	list_del(queue->next);
+	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+		child = cgroup_cs(cont);
+		list_add_tail(&child->stack_list, queue);
+	}
+
+	return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held.  We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'.  But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;		/* scans cpusets being updated */
-	struct cpuset *child;	/* scans child cpusets of cp */
-	struct cgroup *cont;
 	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	while (!list_empty(&queue)) {
-		cp = list_first_entry(&queue, struct cpuset, stack_list);
-		list_del(queue.next);
-		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-			child = cgroup_cs(cont);
-			list_add_tail(&child->stack_list, &queue);
+	switch (event) {
+	case CPUSET_CPU_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
+
+			/* Continue past cpusets with all cpus online */
+			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+				continue;
+
+			/* Remove offline cpus from this cpuset. */
+			mutex_lock(&callback_mutex);
+			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+							cpu_active_mask);
+			mutex_unlock(&callback_mutex);
+
+			/* Move tasks from the empty cpuset to a parent */
+			if (cpumask_empty(cp->cpus_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_cpumask(cp, NULL);
 		}
+		break;
 
-		/* Continue past cpusets with all cpus, mems online */
-		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-			continue;
-
-		oldmems = cp->mems_allowed;
-
-		/* Remove offline cpus and mems from this cpuset. */
-		mutex_lock(&callback_mutex);
-		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_active_mask);
-		nodes_and(cp->mems_allowed, cp->mems_allowed,
-			    node_states[N_HIGH_MEMORY]);
-		mutex_unlock(&callback_mutex);
-
-		/* Move tasks from the empty cpuset to a parent */
-		if (cpumask_empty(cp->cpus_allowed) ||
-		    nodes_empty(cp->mems_allowed))
-			remove_tasks_in_empty_cpuset(cp);
-		else {
-			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems, NULL);
-		}
+	case CPUSET_MEM_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
+
+			/* Continue past cpusets with all mems online */
+			if (nodes_subset(cp->mems_allowed,
+					node_states[N_HIGH_MEMORY]))
+				continue;
+
+			oldmems = cp->mems_allowed;
+
+			/* Remove offline mems from this cpuset. */
+			mutex_lock(&callback_mutex);
+			nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+			mutex_unlock(&callback_mutex);
+
+			/* Move tasks from the empty cpuset to a parent */
+			if (nodes_empty(cp->mems_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_nodemask(cp, &oldmems, NULL);
+		}
 	}
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
-	scan_for_empty_cpusets(&top_cpuset);
+
+	if (!cpu_online)
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	case MEM_OFFLINE:
 		/*
 		 * needn't update top_cpuset.mems_allowed explicitly because
-		 * scan_for_empty_cpusets() will update it.
+		 * scan_cpusets_upon_hotplug() will update it.
 		 */
-		scan_for_empty_cpusets(&top_cpuset);
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
 		break;
 	default:
 		break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..5d011ef4c0df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
  *
  * sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
  *
  * Furthermore, all task_rq users should acquire both locks, see
  * task_rq_lock().
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
+		struct sched_domain *tmp = sd;
+		struct sched_group *sg, *prev;
+		bool right;
+
+		/*
+		 * Traverse to first CPU in group, and count hops
+		 * to cpu from there, switching direction on each
+		 * hop, never ever pointing the last CPU rightward.
+		 */
+		do {
+			id = cpumask_first(sched_domain_span(tmp));
+			prev = sg = tmp->groups;
+			right = 1;
+
+			while (cpumask_first(sched_group_cpus(sg)) != id)
+				sg = sg->next;
+
+			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+				prev = sg;
+				sg = sg->next;
+				right = !right;
+			}
+
+			/* A CPU went down, never point back to domain start. */
+			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+				right = false;
+
+			sg = right ? sg->next : prev;
+			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+		} while ((tmp = tmp->child));
+
 		id = cpumask_first(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
-	switch (action & ~CPU_TASKS_FROZEN) {
+	switch (action) {
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED_FROZEN:
+
+		/*
+		 * num_cpus_frozen tracks how many CPUs are involved in suspend
+		 * resume sequence. As long as this is not the last online
+		 * operation in the resume sequence, just build a single sched
+		 * domain, ignoring cpusets.
+		 */
+		num_cpus_frozen--;
+		if (likely(num_cpus_frozen)) {
+			partition_sched_domains(1, NULL, NULL);
+			break;
+		}
+
+		/*
+		 * This is the last CPU online operation. So fall through and
+		 * restore the original sched domains by considering the
+		 * cpuset configurations.
+		 */
+
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		cpuset_update_active_cpus();
-		return NOTIFY_OK;
+		cpuset_update_active_cpus(true);
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
+	return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
-	switch (action & ~CPU_TASKS_FROZEN) {
+	switch (action) {
 	case CPU_DOWN_PREPARE:
-		cpuset_update_active_cpus();
-		return NOTIFY_OK;
+		cpuset_update_active_cpus(false);
+		break;
+	case CPU_DOWN_PREPARE_FROZEN:
+		num_cpus_frozen++;
+		partition_sched_domains(1, NULL, NULL);
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
+	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+	struct task_group *tg;
 	int on_rq, running;
 	unsigned long flags;
 	struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
+	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+				lockdep_is_held(&tsk->sighand->siglock)),
+			  struct task_group, css);
+	tg = autogroup_task_group(tsk, tg);
+	tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
 		tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
 
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
 	}
-done:
+
 	return target;
 }
 
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
-	int			src_cpu;
 	struct rq		*src_rq;
+	int			src_cpu;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		int new_dst_cpu;
+
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+		/*
+		 * Remember if this task can be migrated to any other cpu in
+		 * our sched_group. We may want to revisit it if we couldn't
+		 * meet load balance goals by pulling other tasks on src_cpu.
+		 *
+		 * Also avoid computing new_dst_cpu if we have already computed
+		 * one in current iteration.
+		 */
+		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+			return 0;
+
+		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+						tsk_cpus_allowed(p));
+		if (new_dst_cpu < nr_cpu_ids) {
+			env->flags |= LBF_SOME_PINNED;
+			env->new_dst_cpu = new_dst_cpu;
+		}
 		return 0;
 	}
+
+	/* Record that we found atleast one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, active_balance = 0;
+	int ld_moved, cur_ld_moved, active_balance = 0;
+	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
+		.dst_grpmask    = sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
+	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
@@ -4267,6 +4281,7 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
+	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
 		double_rq_lock(this_rq, busiest);
 		if (!env.loop)
 			update_h_load(env.src_cpu);
-		ld_moved += move_tasks(&env);
+
+		/*
+		 * cur_ld_moved - load moved in current iteration
+		 * ld_moved    - cumulative load moved across iterations
+		 */
+		cur_ld_moved = move_tasks(&env);
+		ld_moved += cur_ld_moved;
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4296,14 +4317,52 @@ more_balance:
 		/*
 		 * some other cpu did the load balance for us.
 		 */
-		if (ld_moved && this_cpu != smp_processor_id())
-			resched_cpu(this_cpu);
+		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+			resched_cpu(env.dst_cpu);
+
+		/*
+		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+		 * us and move them to an alternate dst_cpu in our sched_group
+		 * where they can run. The upper limit on how many times we
+		 * iterate on same src_cpu is dependent on number of cpus in our
+		 * sched_group.
+		 *
+		 * This changes load balance semantics a bit on who can move
+		 * load to a given_cpu. In addition to the given_cpu itself
+		 * (or a ilb_cpu acting on its behalf where given_cpu is
+		 * nohz-idle), we now have balance_cpu in a position to move
+		 * load to given_cpu. In rare situations, this may cause
+		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+		 * _independently_ and at _same_ time to move some load to
+		 * given_cpu) causing exceess load to be moved to given_cpu.
+		 * This however should not happen so much in practice and
+		 * moreover subsequent load balance cycles should correct the
+		 * excess load moved.
+		 */
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+				lb_iterations++ < max_lb_iterations) {
+
+			this_rq		 = cpu_rq(env.new_dst_cpu);
+			env.dst_rq	 = this_rq;
+			env.dst_cpu	 = env.new_dst_cpu;
+			env.flags	&= ~LBF_SOME_PINNED;
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			/*
+			 * Go back to "more_balance" rather than "redo" since we
+			 * need to continue with same src_cpu.
+			 */
+			goto more_balance;
+		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus))
+			if (!cpumask_empty(cpus)) {
+				env.loop = 0;
+				env.loop_break = sched_nr_migrate_break;
 				goto redo;
+			}
 			goto out_balanced;
 		}
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-	struct task_group *tg;
-	struct cgroup_subsys_state *css;
-
-	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&p->pi_lock) ||
-			lockdep_is_held(&task_rq(p)->lock));
-	tg = container_of(css, struct task_group, css);
-
-	return autogroup_task_group(p, tg);
+	return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
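To make the sched.h hunk above easier to follow, here is a rough user-space sketch of the pattern it switches to: readers of task_group() just load a cached pointer (p->sched_task_group), and sched_move_task() is the only writer, updating that copy while holding both ->pi_lock and the runqueue lock. The pthread mutexes and simplified structs below are stand-ins invented for the sketch; only the field and function names taken from the patch are real.

/*
 * User-space sketch (not kernel code) of the task_group() race fix.
 * pi_lock and rq_lock stand in for task_struct::pi_lock and rq->lock.
 */
#include <pthread.h>
#include <stdio.h>

struct task_group { const char *name; };

struct task_struct {
	pthread_mutex_t pi_lock;
	pthread_mutex_t rq_lock;	/* stand-in for task_rq(p)->lock */
	struct task_group *sched_task_group;
};

/* Reader: no cgroup/css pointer chasing, just the cached copy. */
static struct task_group *task_group(struct task_struct *p)
{
	return p->sched_task_group;
}

/* Writer: the only place the cached copy changes, under both locks. */
static void sched_move_task(struct task_struct *p, struct task_group *tg)
{
	pthread_mutex_lock(&p->pi_lock);
	pthread_mutex_lock(&p->rq_lock);
	p->sched_task_group = tg;
	pthread_mutex_unlock(&p->rq_lock);
	pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
	struct task_group root = { "root" }, batch = { "batch" };
	struct task_struct p = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.rq_lock = PTHREAD_MUTEX_INITIALIZER,
		.sched_task_group = &root,
	};

	sched_move_task(&p, &batch);
	printf("task now in group '%s'\n", task_group(&p)->name);
	return 0;
}

The design point mirrored here is that a reader holding either lock cannot observe a half-updated group, because the writer takes both locks around the single pointer store.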