Diffstat (limited to 'kernel/sched')

 kernel/sched/Makefile    |   2
 kernel/sched/core.c      | 667
 kernel/sched/debug.c     |  12
 kernel/sched/fair.c      | 543
 kernel/sched/features.h  |   1
 kernel/sched/idle_task.c |   2
 kernel/sched/rt.c        | 107
 kernel/sched/sched.h     |  10
 8 files changed, 635 insertions(+), 709 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
141#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
142 #name , 143 #name ,
143 144
144static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
145#include "features.h" 146#include "features.h"
146 NULL
147}; 147};
148 148
149#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2081#endif
2084 2082
2085 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2085 switch_to(prev, next, prev);
2087 2086
2088 barrier(); 2087 barrier();
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2485 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2486 * every tick. We fix it up based on jiffies.
2488 */ 2487 */
2489void update_cpu_load(struct rq *this_rq) 2488static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2489 unsigned long pending_updates)
2490{ 2490{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2491 int i, scale;
2495 2492
2496 this_rq->nr_load_updates++; 2493 this_rq->nr_load_updates++;
2497 2494
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2495 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2496 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2497 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2527} 2517}
2528 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2533/*
2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2535 * idle balance.
2536 */
2537void update_idle_cpu_load(struct rq *this_rq)
2538{
2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2540 unsigned long load = this_rq->load.weight;
2541 unsigned long pending_updates;
2542
2543 /*
2544 * bail if there's load or we're actually up-to-date.
2545 */
2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2547 return;
2548
2549 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2550 this_rq->last_load_update_tick = curr_jiffies;
2551
2552 __update_cpu_load(this_rq, load, pending_updates);
2553}
2554
2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2582 * Called from scheduler_tick()
2583 */
2529static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2530{ 2585{
2531 update_cpu_load(this_rq); 2586 /*
2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2588 */
2589 this_rq->last_load_update_tick = jiffies;
2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2591
2533 calc_load_account_active(this_rq); 2592 calc_load_account_active(this_rq);
2534} 2593}
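
Note: the cpu_load[] bookkeeping that __update_cpu_load() now takes a pending_updates argument for can be illustrated with a small userspace sketch. This is an approximation rather than the kernel code: the real decay_load_missed() uses a precomputed lookup table and slightly different rounding, and the names below (update_cpu_load_sketch and friends) are invented for the example.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

/*
 * Index 0 tracks the instantaneous load; higher indexes decay with
 * longer half-lives (scale = 2^i).  Ticks missed while idle are made
 * up for by decaying the old value once per missed tick before the
 * new sample is blended in.
 */
static void update_cpu_load_sketch(unsigned long this_load,
                                   unsigned long pending_updates)
{
        unsigned long i, j, scale;

        cpu_load[0] = this_load;
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load = cpu_load[i];

                for (j = 1; j < pending_updates; j++)
                        old_load = old_load * (scale - 1) / scale;

                cpu_load[i] = (old_load * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        unsigned long i;

        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                cpu_load[i] = 1024;

        /* wake up after 8 idle ticks with zero load, as update_cpu_load_nohz() would */
        update_cpu_load_sketch(0, 8);
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%lu] = %lu\n", i, cpu_load[i]);
        return 0;
}

Coming out of idle, the higher indexes decay progressively more slowly, which is the behaviour update_idle_cpu_load() and update_cpu_load_nohz() preserve when they fix up the ticks missed while idle.
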
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3172 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3173 print_irqtrace_events(prev);
3115 dump_stack(); 3174 dump_stack();
3175 add_taint(TAINT_WARN);
3116} 3176}
3117 3177
3118/* 3178/*
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p)
4042 4102
4043 rcu_read_lock(); 4103 rcu_read_lock();
4044 pcred = __task_cred(p); 4104 pcred = __task_cred(p);
4045 if (cred->user->user_ns == pcred->user->user_ns) 4105 match = (uid_eq(cred->euid, pcred->euid) ||
4046 match = (cred->euid == pcred->euid || 4106 uid_eq(cred->euid, pcred->uid));
4047 cred->euid == pcred->uid);
4048 else
4049 match = false;
4050 rcu_read_unlock(); 4107 rcu_read_unlock();
4051 return match; 4108 return match;
4052} 4109}
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4957 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4958 5015
4959 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4960 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4961} 5018}
4962 5019
4963/* 5020/*
@@ -5499,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5499 5556
5500#ifdef CONFIG_SCHED_DEBUG 5557#ifdef CONFIG_SCHED_DEBUG
5501 5558
5502static __read_mostly int sched_domain_debug_enabled; 5559static __read_mostly int sched_debug_enabled;
5503 5560
5504static int __init sched_domain_debug_setup(char *str) 5561static int __init sched_debug_setup(char *str)
5505{ 5562{
5506 sched_domain_debug_enabled = 1; 5563 sched_debug_enabled = 1;
5507 5564
5508 return 0; 5565 return 0;
5509} 5566}
5510early_param("sched_debug", sched_domain_debug_setup); 5567early_param("sched_debug", sched_debug_setup);
5568
5569static inline bool sched_debug(void)
5570{
5571 return sched_debug_enabled;
5572}
5511 5573
5512static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5574static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5513 struct cpumask *groupmask) 5575 struct cpumask *groupmask)
@@ -5547,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5547 break; 5609 break;
5548 } 5610 }
5549 5611
5550 if (!group->sgp->power) { 5612 /*
5613 * Even though we initialize ->power to something semi-sane,
5614 * we leave power_orig unset. This allows us to detect if
5615 * domain iteration is still funny without causing /0 traps.
5616 */
5617 if (!group->sgp->power_orig) {
5551 printk(KERN_CONT "\n"); 5618 printk(KERN_CONT "\n");
5552 printk(KERN_ERR "ERROR: domain->cpu_power not " 5619 printk(KERN_ERR "ERROR: domain->cpu_power not "
5553 "set\n"); 5620 "set\n");
@@ -5560,7 +5627,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5627 break;
5561 } 5628 }
5562 5629
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5630 if (!(sd->flags & SD_OVERLAP) &&
5631 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5632 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5633 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5634 break;
@@ -5594,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5594{ 5662{
5595 int level = 0; 5663 int level = 0;
5596 5664
5597 if (!sched_domain_debug_enabled) 5665 if (!sched_debug_enabled)
5598 return; 5666 return;
5599 5667
5600 if (!sd) { 5668 if (!sd) {
@@ -5615,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5615} 5683}
5616#else /* !CONFIG_SCHED_DEBUG */ 5684#else /* !CONFIG_SCHED_DEBUG */
5617# define sched_domain_debug(sd, cpu) do { } while (0) 5685# define sched_domain_debug(sd, cpu) do { } while (0)
5686static inline bool sched_debug(void)
5687{
5688 return false;
5689}
5618#endif /* CONFIG_SCHED_DEBUG */ 5690#endif /* CONFIG_SCHED_DEBUG */
5619 5691
5620static int sd_degenerate(struct sched_domain *sd) 5692static int sd_degenerate(struct sched_domain *sd)
@@ -5898,99 +5970,11 @@ static int __init isolated_cpu_setup(char *str)
5898 5970
5899__setup("isolcpus=", isolated_cpu_setup); 5971__setup("isolcpus=", isolated_cpu_setup);
5900 5972
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5973static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5974{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5975 return cpumask_of_node(cpu_to_node(cpu));
5990} 5976}
5991 5977
5992int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5993
5994struct sd_data { 5978struct sd_data {
5995 struct sched_domain **__percpu sd; 5979 struct sched_domain **__percpu sd;
5996 struct sched_group **__percpu sg; 5980 struct sched_group **__percpu sg;
@@ -6020,9 +6004,48 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 6004 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 6005 sched_domain_mask_f mask;
6022 int flags; 6006 int flags;
6007 int numa_level;
6023 struct sd_data data; 6008 struct sd_data data;
6024}; 6009};
6025 6010
6011/*
6012 * Build an iteration mask that can exclude certain CPUs from the upwards
6013 * domain traversal.
6014 *
6015 * Asymmetric node setups can result in situations where the domain tree is of
6016 * unequal depth, make sure to skip domains that already cover the entire
6017 * range.
6018 *
6019 * In that case build_sched_domains() will have terminated the iteration early
6020 * and our sibling sd spans will be empty. Domains should always include the
6021 * cpu they're built on, so check that.
6022 *
6023 */
6024static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6025{
6026 const struct cpumask *span = sched_domain_span(sd);
6027 struct sd_data *sdd = sd->private;
6028 struct sched_domain *sibling;
6029 int i;
6030
6031 for_each_cpu(i, span) {
6032 sibling = *per_cpu_ptr(sdd->sd, i);
6033 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6034 continue;
6035
6036 cpumask_set_cpu(i, sched_group_mask(sg));
6037 }
6038}
6039
6040/*
6041 * Return the canonical balance cpu for this group, this is the first cpu
6042 * of this group that's also in the iteration mask.
6043 */
6044int group_balance_cpu(struct sched_group *sg)
6045{
6046 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6047}
6048
6026static int 6049static int
6027build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6050build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6028{ 6051{
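
Note: a quick illustration of the new group_balance_cpu() semantics -- the canonical balance cpu is the first cpu that lies both in the group's span and in the iteration mask that build_group_mask() constructs. The sketch below is purely illustrative: it uses plain bitmasks instead of struct cpumask, relies on the GCC/Clang __builtin_ctzl() bit scan, and toy_group / toy_group_balance_cpu are invented names.

#include <stdio.h>

/*
 * Toy model: a group is a pair of bitmasks, one for the cpus it spans
 * and one for the cpus usable for the upward domain iteration.  The
 * balance cpu is the lowest bit set in both, mirroring
 * cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)).
 */
struct toy_group {
        unsigned long span;
        unsigned long mask;
};

static int toy_group_balance_cpu(const struct toy_group *sg)
{
        unsigned long both = sg->span & sg->mask;

        return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
        /* spans cpus 2-5, but only cpus 4 and 5 survive the group mask */
        struct toy_group sg = { .span = 0x3c, .mask = 0x30 };

        printf("balance cpu = %d\n", toy_group_balance_cpu(&sg));
        return 0;
}

With a group spanning cpus 2-5 but an iteration mask of {4,5}, the balance cpu is 4.
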
@@ -6041,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6041 if (cpumask_test_cpu(i, covered)) 6064 if (cpumask_test_cpu(i, covered))
6042 continue; 6065 continue;
6043 6066
6067 child = *per_cpu_ptr(sdd->sd, i);
6068
6069 /* See the comment near build_group_mask(). */
6070 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6071 continue;
6072
6044 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6073 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6045 GFP_KERNEL, cpu_to_node(cpu)); 6074 GFP_KERNEL, cpu_to_node(cpu));
6046 6075
@@ -6048,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6048 goto fail; 6077 goto fail;
6049 6078
6050 sg_span = sched_group_cpus(sg); 6079 sg_span = sched_group_cpus(sg);
6051
6052 child = *per_cpu_ptr(sdd->sd, i);
6053 if (child->child) { 6080 if (child->child) {
6054 child = child->child; 6081 child = child->child;
6055 cpumask_copy(sg_span, sched_domain_span(child)); 6082 cpumask_copy(sg_span, sched_domain_span(child));
@@ -6058,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6058 6085
6059 cpumask_or(covered, covered, sg_span); 6086 cpumask_or(covered, covered, sg_span);
6060 6087
6061 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6088 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6062 atomic_inc(&sg->sgp->ref); 6089 if (atomic_inc_return(&sg->sgp->ref) == 1)
6090 build_group_mask(sd, sg);
6063 6091
6064 if (cpumask_test_cpu(cpu, sg_span)) 6092 /*
6093 * Initialize sgp->power such that even if we mess up the
6094 * domains and no possible iteration will get us here, we won't
6095 * die on a /0 trap.
6096 */
6097 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6098
6099 /*
6100 * Make sure the first group of this domain contains the
6101 * canonical balance cpu. Otherwise the sched_domain iteration
6102 * breaks. See update_sg_lb_stats().
6103 */
6104 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6105 group_balance_cpu(sg) == cpu)
6065 groups = sg; 6106 groups = sg;
6066 6107
6067 if (!first) 6108 if (!first)
@@ -6135,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6135 6176
6136 cpumask_clear(sched_group_cpus(sg)); 6177 cpumask_clear(sched_group_cpus(sg));
6137 sg->sgp->power = 0; 6178 sg->sgp->power = 0;
6179 cpumask_setall(sched_group_mask(sg));
6138 6180
6139 for_each_cpu(j, span) { 6181 for_each_cpu(j, span) {
6140 if (get_group(j, sdd, NULL) != group) 6182 if (get_group(j, sdd, NULL) != group)
@@ -6176,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6176 sg = sg->next; 6218 sg = sg->next;
6177 } while (sg != sd->groups); 6219 } while (sg != sd->groups);
6178 6220
6179 if (cpu != group_first_cpu(sg)) 6221 if (cpu != group_balance_cpu(sg))
6180 return; 6222 return;
6181 6223
6182 update_group_power(sd, cpu); 6224 update_group_power(sd, cpu);
@@ -6211,10 +6253,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6253}
6212 6254
6213SD_INIT_FUNC(CPU) 6255SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6256#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6257 SD_INIT_FUNC(SIBLING)
6220#endif 6258#endif
@@ -6230,11 +6268,8 @@ int sched_domain_level_max;
6230 6268
6231static int __init setup_relax_domain_level(char *str) 6269static int __init setup_relax_domain_level(char *str)
6232{ 6270{
6233 unsigned long val; 6271 if (kstrtoint(str, 0, &default_relax_domain_level))
6234 6272 pr_warn("Unable to set relax_domain_level\n");
6235 val = simple_strtoul(str, NULL, 0);
6236 if (val < sched_domain_level_max)
6237 default_relax_domain_level = val;
6238 6273
6239 return 1; 6274 return 1;
6240} 6275}
@@ -6336,15 +6371,236 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6371 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6372#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6373 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6374 { NULL, },
6344}; 6375};
6345 6376
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6377static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6378
6379#ifdef CONFIG_NUMA
6380
6381static int sched_domains_numa_levels;
6382static int *sched_domains_numa_distance;
6383static struct cpumask ***sched_domains_numa_masks;
6384static int sched_domains_curr_level;
6385
6386static inline int sd_local_flags(int level)
6387{
6388 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6389 return 0;
6390
6391 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6392}
6393
6394static struct sched_domain *
6395sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6396{
6397 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6398 int level = tl->numa_level;
6399 int sd_weight = cpumask_weight(
6400 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6401
6402 *sd = (struct sched_domain){
6403 .min_interval = sd_weight,
6404 .max_interval = 2*sd_weight,
6405 .busy_factor = 32,
6406 .imbalance_pct = 125,
6407 .cache_nice_tries = 2,
6408 .busy_idx = 3,
6409 .idle_idx = 2,
6410 .newidle_idx = 0,
6411 .wake_idx = 0,
6412 .forkexec_idx = 0,
6413
6414 .flags = 1*SD_LOAD_BALANCE
6415 | 1*SD_BALANCE_NEWIDLE
6416 | 0*SD_BALANCE_EXEC
6417 | 0*SD_BALANCE_FORK
6418 | 0*SD_BALANCE_WAKE
6419 | 0*SD_WAKE_AFFINE
6420 | 0*SD_PREFER_LOCAL
6421 | 0*SD_SHARE_CPUPOWER
6422 | 0*SD_SHARE_PKG_RESOURCES
6423 | 1*SD_SERIALIZE
6424 | 0*SD_PREFER_SIBLING
6425 | sd_local_flags(level)
6426 ,
6427 .last_balance = jiffies,
6428 .balance_interval = sd_weight,
6429 };
6430 SD_INIT_NAME(sd, NUMA);
6431 sd->private = &tl->data;
6432
6433 /*
6434 * Ugly hack to pass state to sd_numa_mask()...
6435 */
6436 sched_domains_curr_level = tl->numa_level;
6437
6438 return sd;
6439}
6440
6441static const struct cpumask *sd_numa_mask(int cpu)
6442{
6443 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6444}
6445
6446static void sched_numa_warn(const char *str)
6447{
6448 static int done = false;
6449 int i,j;
6450
6451 if (done)
6452 return;
6453
6454 done = true;
6455
6456 printk(KERN_WARNING "ERROR: %s\n\n", str);
6457
6458 for (i = 0; i < nr_node_ids; i++) {
6459 printk(KERN_WARNING " ");
6460 for (j = 0; j < nr_node_ids; j++)
6461 printk(KERN_CONT "%02d ", node_distance(i,j));
6462 printk(KERN_CONT "\n");
6463 }
6464 printk(KERN_WARNING "\n");
6465}
6466
6467static bool find_numa_distance(int distance)
6468{
6469 int i;
6470
6471 if (distance == node_distance(0, 0))
6472 return true;
6473
6474 for (i = 0; i < sched_domains_numa_levels; i++) {
6475 if (sched_domains_numa_distance[i] == distance)
6476 return true;
6477 }
6478
6479 return false;
6480}
6481
6482static void sched_init_numa(void)
6483{
6484 int next_distance, curr_distance = node_distance(0, 0);
6485 struct sched_domain_topology_level *tl;
6486 int level = 0;
6487 int i, j, k;
6488
6489 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6490 if (!sched_domains_numa_distance)
6491 return;
6492
6493 /*
6494 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6495 * unique distances in the node_distance() table.
6496 *
6497 * Assumes node_distance(0,j) includes all distances in
6498 * node_distance(i,j) in order to avoid cubic time.
6499 */
6500 next_distance = curr_distance;
6501 for (i = 0; i < nr_node_ids; i++) {
6502 for (j = 0; j < nr_node_ids; j++) {
6503 for (k = 0; k < nr_node_ids; k++) {
6504 int distance = node_distance(i, k);
6505
6506 if (distance > curr_distance &&
6507 (distance < next_distance ||
6508 next_distance == curr_distance))
6509 next_distance = distance;
6510
6511 /*
6512 * While not a strong assumption it would be nice to know
6513 * about cases where if node A is connected to B, B is not
6514 * equally connected to A.
6515 */
6516 if (sched_debug() && node_distance(k, i) != distance)
6517 sched_numa_warn("Node-distance not symmetric");
6518
6519 if (sched_debug() && i && !find_numa_distance(distance))
6520 sched_numa_warn("Node-0 not representative");
6521 }
6522 if (next_distance != curr_distance) {
6523 sched_domains_numa_distance[level++] = next_distance;
6524 sched_domains_numa_levels = level;
6525 curr_distance = next_distance;
6526 } else break;
6527 }
6528
6529 /*
6530 * In case of sched_debug() we verify the above assumption.
6531 */
6532 if (!sched_debug())
6533 break;
6534 }
6535 /*
6536 * 'level' contains the number of unique distances, excluding the
6537 * identity distance node_distance(i,i).
6538 *
 6539 * The sched_domains_numa_distance[] array includes the actual distance
6540 * numbers.
6541 */
6542
6543 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6544 if (!sched_domains_numa_masks)
6545 return;
6546
6547 /*
6548 * Now for each level, construct a mask per node which contains all
6549 * cpus of nodes that are that many hops away from us.
6550 */
6551 for (i = 0; i < level; i++) {
6552 sched_domains_numa_masks[i] =
6553 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6554 if (!sched_domains_numa_masks[i])
6555 return;
6556
6557 for (j = 0; j < nr_node_ids; j++) {
6558 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6559 if (!mask)
6560 return;
6561
6562 sched_domains_numa_masks[i][j] = mask;
6563
6564 for (k = 0; k < nr_node_ids; k++) {
6565 if (node_distance(j, k) > sched_domains_numa_distance[i])
6566 continue;
6567
6568 cpumask_or(mask, mask, cpumask_of_node(k));
6569 }
6570 }
6571 }
6572
6573 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6574 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6575 if (!tl)
6576 return;
6577
6578 /*
6579 * Copy the default topology bits..
6580 */
6581 for (i = 0; default_topology[i].init; i++)
6582 tl[i] = default_topology[i];
6583
6584 /*
6585 * .. and append 'j' levels of NUMA goodness.
6586 */
6587 for (j = 0; j < level; i++, j++) {
6588 tl[i] = (struct sched_domain_topology_level){
6589 .init = sd_numa_init,
6590 .mask = sd_numa_mask,
6591 .flags = SDTL_OVERLAP,
6592 .numa_level = j,
6593 };
6594 }
6595
6596 sched_domain_topology = tl;
6597}
6598#else
6599static inline void sched_init_numa(void)
6600{
6601}
6602#endif /* CONFIG_NUMA */
6603
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6604static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6605{
6350 struct sched_domain_topology_level *tl; 6606 struct sched_domain_topology_level *tl;
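
Note: the deduplicating selection sort in sched_init_numa() is easier to follow in isolation. The sketch below is illustrative only: it hard-codes a 4-node SLIT-style distance table instead of querying node_distance(), and it scans the full table rather than relying on the node-0-is-representative assumption, but it extracts the same thing -- the unique distances in increasing order that end up in sched_domains_numa_distance[] and decide how many NUMA levels get appended to the topology.

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* SLIT-style table: 10 = local node, larger = further away */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        int curr = dist[0][0];          /* identity distance, excluded */
        int levels = 0;

        for (;;) {
                int next = INT_MAX;
                int i, j;

                /* smallest distance strictly larger than 'curr' */
                for (i = 0; i < NR_NODES; i++)
                        for (j = 0; j < NR_NODES; j++)
                                if (dist[i][j] > curr && dist[i][j] < next)
                                        next = dist[i][j];

                if (next == INT_MAX)
                        break;

                printf("numa level %d: distance %d\n", levels++, next);
                curr = next;
        }
        printf("-> %d extra topology level(s) appended\n", levels);
        return 0;
}

For this table the program reports distances 20 and 30, i.e. two NUMA levels on top of the default topology.
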
@@ -6382,9 +6638,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6382 if (!sg) 6638 if (!sg)
6383 return -ENOMEM; 6639 return -ENOMEM;
6384 6640
6641 sg->next = sg;
6642
6385 *per_cpu_ptr(sdd->sg, j) = sg; 6643 *per_cpu_ptr(sdd->sg, j) = sg;
6386 6644
6387 sgp = kzalloc_node(sizeof(struct sched_group_power), 6645 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6388 GFP_KERNEL, cpu_to_node(j)); 6646 GFP_KERNEL, cpu_to_node(j));
6389 if (!sgp) 6647 if (!sgp)
6390 return -ENOMEM; 6648 return -ENOMEM;
@@ -6405,16 +6663,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
6405 struct sd_data *sdd = &tl->data; 6663 struct sd_data *sdd = &tl->data;
6406 6664
6407 for_each_cpu(j, cpu_map) { 6665 for_each_cpu(j, cpu_map) {
6408 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6666 struct sched_domain *sd;
6409 if (sd && (sd->flags & SD_OVERLAP)) 6667
6410 free_sched_groups(sd->groups, 0); 6668 if (sdd->sd) {
6411 kfree(*per_cpu_ptr(sdd->sd, j)); 6669 sd = *per_cpu_ptr(sdd->sd, j);
6412 kfree(*per_cpu_ptr(sdd->sg, j)); 6670 if (sd && (sd->flags & SD_OVERLAP))
6413 kfree(*per_cpu_ptr(sdd->sgp, j)); 6671 free_sched_groups(sd->groups, 0);
6672 kfree(*per_cpu_ptr(sdd->sd, j));
6673 }
6674
6675 if (sdd->sg)
6676 kfree(*per_cpu_ptr(sdd->sg, j));
6677 if (sdd->sgp)
6678 kfree(*per_cpu_ptr(sdd->sgp, j));
6414 } 6679 }
6415 free_percpu(sdd->sd); 6680 free_percpu(sdd->sd);
6681 sdd->sd = NULL;
6416 free_percpu(sdd->sg); 6682 free_percpu(sdd->sg);
6683 sdd->sg = NULL;
6417 free_percpu(sdd->sgp); 6684 free_percpu(sdd->sgp);
6685 sdd->sgp = NULL;
6418 } 6686 }
6419} 6687}
6420 6688
@@ -6427,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6427 if (!sd) 6695 if (!sd)
6428 return child; 6696 return child;
6429 6697
6430 set_domain_attribute(sd, attr);
6431 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6698 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6432 if (child) { 6699 if (child) {
6433 sd->level = child->level + 1; 6700 sd->level = child->level + 1;
@@ -6435,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6435 child->parent = sd; 6702 child->parent = sd;
6436 } 6703 }
6437 sd->child = child; 6704 sd->child = child;
6705 set_domain_attribute(sd, attr);
6438 6706
6439 return sd; 6707 return sd;
6440} 6708}
@@ -6575,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6575 if (!doms_cur) 6843 if (!doms_cur)
6576 doms_cur = &fallback_doms; 6844 doms_cur = &fallback_doms;
6577 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6845 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6578 dattr_cur = NULL;
6579 err = build_sched_domains(doms_cur[0], NULL); 6846 err = build_sched_domains(doms_cur[0], NULL);
6580 register_sched_domain_sysctl(); 6847 register_sched_domain_sysctl();
6581 6848
@@ -6700,97 +6967,6 @@ match2:
6700 mutex_unlock(&sched_domains_mutex); 6967 mutex_unlock(&sched_domains_mutex);
6701} 6968}
6702 6969
6703#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6704static void reinit_sched_domains(void)
6705{
6706 get_online_cpus();
6707
6708 /* Destroy domains first to force the rebuild */
6709 partition_sched_domains(0, NULL, NULL);
6710
6711 rebuild_sched_domains();
6712 put_online_cpus();
6713}
6714
6715static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6716{
6717 unsigned int level = 0;
6718
6719 if (sscanf(buf, "%u", &level) != 1)
6720 return -EINVAL;
6721
6722 /*
6723 * level is always be positive so don't check for
6724 * level < POWERSAVINGS_BALANCE_NONE which is 0
6725 * What happens on 0 or 1 byte write,
6726 * need to check for count as well?
6727 */
6728
6729 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6730 return -EINVAL;
6731
6732 if (smt)
6733 sched_smt_power_savings = level;
6734 else
6735 sched_mc_power_savings = level;
6736
6737 reinit_sched_domains();
6738
6739 return count;
6740}
6741
6742#ifdef CONFIG_SCHED_MC
6743static ssize_t sched_mc_power_savings_show(struct device *dev,
6744 struct device_attribute *attr,
6745 char *buf)
6746{
6747 return sprintf(buf, "%u\n", sched_mc_power_savings);
6748}
6749static ssize_t sched_mc_power_savings_store(struct device *dev,
6750 struct device_attribute *attr,
6751 const char *buf, size_t count)
6752{
6753 return sched_power_savings_store(buf, count, 0);
6754}
6755static DEVICE_ATTR(sched_mc_power_savings, 0644,
6756 sched_mc_power_savings_show,
6757 sched_mc_power_savings_store);
6758#endif
6759
6760#ifdef CONFIG_SCHED_SMT
6761static ssize_t sched_smt_power_savings_show(struct device *dev,
6762 struct device_attribute *attr,
6763 char *buf)
6764{
6765 return sprintf(buf, "%u\n", sched_smt_power_savings);
6766}
6767static ssize_t sched_smt_power_savings_store(struct device *dev,
6768 struct device_attribute *attr,
6769 const char *buf, size_t count)
6770{
6771 return sched_power_savings_store(buf, count, 1);
6772}
6773static DEVICE_ATTR(sched_smt_power_savings, 0644,
6774 sched_smt_power_savings_show,
6775 sched_smt_power_savings_store);
6776#endif
6777
6778int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6779{
6780 int err = 0;
6781
6782#ifdef CONFIG_SCHED_SMT
6783 if (smt_capable())
6784 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6785#endif
6786#ifdef CONFIG_SCHED_MC
6787 if (!err && mc_capable())
6788 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6789#endif
6790 return err;
6791}
6792#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6793
6794/* 6970/*
6795 * Update cpusets according to cpu_active mask. If cpusets are 6971 * Update cpusets according to cpu_active mask. If cpusets are
6796 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6972 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6828,6 +7004,8 @@ void __init sched_init_smp(void)
6828 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7004 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6829 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7005 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6830 7006
7007 sched_init_numa();
7008
6831 get_online_cpus(); 7009 get_online_cpus();
6832 mutex_lock(&sched_domains_mutex); 7010 mutex_lock(&sched_domains_mutex);
6833 init_sched_domains(cpu_active_mask); 7011 init_sched_domains(cpu_active_mask);
@@ -7049,6 +7227,7 @@ void __init sched_init(void)
7049 /* May be allocated at isolcpus cmdline parse time */ 7227 /* May be allocated at isolcpus cmdline parse time */
7050 if (cpu_isolated_map == NULL) 7228 if (cpu_isolated_map == NULL)
7051 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7229 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7230 idle_thread_set_boot_cpu();
7052#endif 7231#endif
7053 init_sched_fair_class(); 7232 init_sched_fair_class();
7054 7233
@@ -7970,13 +8149,9 @@ static struct cftype cpu_files[] = {
7970 .write_u64 = cpu_rt_period_write_uint, 8149 .write_u64 = cpu_rt_period_write_uint,
7971 }, 8150 },
7972#endif 8151#endif
8152 { } /* terminate */
7973}; 8153};
7974 8154
7975static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7976{
7977 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7978}
7979
7980struct cgroup_subsys cpu_cgroup_subsys = { 8155struct cgroup_subsys cpu_cgroup_subsys = {
7981 .name = "cpu", 8156 .name = "cpu",
7982 .create = cpu_cgroup_create, 8157 .create = cpu_cgroup_create,
@@ -7984,8 +8159,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7984 .can_attach = cpu_cgroup_can_attach, 8159 .can_attach = cpu_cgroup_can_attach,
7985 .attach = cpu_cgroup_attach, 8160 .attach = cpu_cgroup_attach,
7986 .exit = cpu_cgroup_exit, 8161 .exit = cpu_cgroup_exit,
7987 .populate = cpu_cgroup_populate,
7988 .subsys_id = cpu_cgroup_subsys_id, 8162 .subsys_id = cpu_cgroup_subsys_id,
8163 .base_cftypes = cpu_files,
7989 .early_init = 1, 8164 .early_init = 1,
7990}; 8165};
7991 8166
@@ -8170,13 +8345,9 @@ static struct cftype files[] = {
8170 .name = "stat", 8345 .name = "stat",
8171 .read_map = cpuacct_stats_show, 8346 .read_map = cpuacct_stats_show,
8172 }, 8347 },
8348 { } /* terminate */
8173}; 8349};
8174 8350
8175static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8176{
8177 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8178}
8179
8180/* 8351/*
8181 * charge this task's execution time to its accounting group. 8352 * charge this task's execution time to its accounting group.
8182 * 8353 *
@@ -8208,7 +8379,7 @@ struct cgroup_subsys cpuacct_subsys = {
8208 .name = "cpuacct", 8379 .name = "cpuacct",
8209 .create = cpuacct_create, 8380 .create = cpuacct_create,
8210 .destroy = cpuacct_destroy, 8381 .destroy = cpuacct_destroy,
8211 .populate = cpuacct_populate,
8212 .subsys_id = cpuacct_subsys_id, 8382 .subsys_id = cpuacct_subsys_id,
8383 .base_cftypes = files,
8213}; 8384};
8214#endif /* CONFIG_CGROUP_CPUACCT */ 8385#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
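
Note: the rationale for the new P(x) macro is the sizeof() dispatch, so that 32-bit rq fields such as nr_running are not blindly widened through a 64-bit cast. A userspace analogue is sketched below; rq_sketch is an invented struct, and plain printf with %lld stands in for the kernel's SEQ_printf/%Ld.

#include <stdio.h>

/* invented struct standing in for struct rq */
struct rq_sketch {
        unsigned int nr_running;        /* 4-byte field */
        unsigned long long clock;       /* 8-byte field */
};

/*
 * Same trick as the new P(x) macro: pick the print format from the
 * field's size instead of always widening to 64 bit.
 */
#define P(rq, x)                                                        \
do {                                                                    \
        if (sizeof((rq)->x) == 4)                                       \
                printf("  .%-30s: %ld\n", #x, (long)(rq)->x);           \
        else                                                            \
                printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);     \
} while (0)

int main(void)
{
        struct rq_sketch rq = { .nr_running = 3, .clock = 123456789ULL };

        P(&rq, nr_running);
        P(&rq, clock);
        return 0;
}

Because sizeof() is a compile-time constant, only one branch of the if survives per field, so the dispatch costs nothing at runtime.
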
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f0..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP 785#ifdef CONFIG_SMP
786 if (entity_is_task(se)) 786 if (entity_is_task(se))
787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif 788#endif
789 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
790} 790}
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env)
3215 3212
3216static unsigned long task_h_load(struct task_struct *p); 3213static unsigned long task_h_load(struct task_struct *p);
3217 3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3218/* 3217/*
3219 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3220 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3222 * 3221 *
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3229 unsigned long load; 3228 unsigned long load;
3230 int pulled = 0; 3229 int pulled = 0;
3231 3230
3232 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3233 return 0; 3232 return 0;
3234 3233
3235 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env)
3242 3241
3243 /* take a breather every nr_migrate tasks */ 3242 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) { 3243 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate; 3244 env->loop_break += sched_nr_migrate_break;
3246 env->flags |= LBF_NEED_BREAK; 3245 env->flags |= LBF_NEED_BREAK;
3247 break; 3246 break;
3248 } 3247 }
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env)
3252 3251
3253 load = task_h_load(p); 3252 load = task_h_load(p);
3254 3253
3255 if (load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3256 goto next; 3255 goto next;
3257 3256
3258 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3259 goto next; 3258 goto next;
3260 3259
3261 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3263 3262
3264 move_task(p, env); 3263 move_task(p, env);
3265 pulled++; 3264 pulled++;
3266 env->load_move -= load; 3265 env->imbalance -= load;
3267 3266
3268#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3269 /* 3268 /*
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3279 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3280 * weighted load. 3279 * weighted load.
3281 */ 3280 */
3282 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3283 break; 3282 break;
3284 3283
3285 continue; 3284 continue;
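
Note: the load_move -> imbalance rename above is purely cosmetic, but the shape of the move_tasks() loop is worth spelling out. The toy pull_tasks() below (an invented name, working on plain integers instead of task_structs) keeps only the two decisions visible in this hunk: skip a task when it is too big a chunk for the remaining imbalance, and stop as soon as the imbalance is covered.

#include <stdio.h>

/*
 * Walk a list of task loads and "migrate" tasks until the requested
 * imbalance is covered.  As in the real code, a task is skipped when
 * load / 2 > remaining imbalance, and the loop stops as soon as the
 * imbalance drops to zero or below.
 */
static int pull_tasks(const long *load, int nr, long imbalance)
{
        int i, pulled = 0;

        for (i = 0; i < nr && imbalance > 0; i++) {
                if (load[i] / 2 > imbalance)
                        continue;

                imbalance -= load[i];
                pulled++;
        }
        return pulled;
}

int main(void)
{
        const long loads[] = { 1024, 512, 2048, 256, 128 };

        printf("pulled %d tasks\n", pull_tasks(loads, 5, 900));
        return 0;
}
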
@@ -3433,14 +3432,6 @@ struct sd_lb_stats {
3433 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3434 3433
3435 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3436#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3437 int power_savings_balance; /* Is powersave balance needed for this sd */
3438 struct sched_group *group_min; /* Least loaded group in sd */
3439 struct sched_group *group_leader; /* Group which relieves group_min */
3440 unsigned long min_load_per_task; /* load_per_task in group_min */
3441 unsigned long leader_nr_running; /* Nr running of group_leader */
3442 unsigned long min_nr_running; /* Nr running of group_min */
3443#endif
3444}; 3435};
3445 3436
3446/* 3437/*
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3484 return load_idx; 3475 return load_idx;
3485} 3476}
3486 3477
3487
3488#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3489/**
3490 * init_sd_power_savings_stats - Initialize power savings statistics for
3491 * the given sched_domain, during load balancing.
3492 *
3493 * @sd: Sched domain whose power-savings statistics are to be initialized.
3494 * @sds: Variable containing the statistics for sd.
3495 * @idle: Idle status of the CPU at which we're performing load-balancing.
3496 */
3497static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3498 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3499{
3500 /*
3501 * Busy processors will not participate in power savings
3502 * balance.
3503 */
3504 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3505 sds->power_savings_balance = 0;
3506 else {
3507 sds->power_savings_balance = 1;
3508 sds->min_nr_running = ULONG_MAX;
3509 sds->leader_nr_running = 0;
3510 }
3511}
3512
3513/**
3514 * update_sd_power_savings_stats - Update the power saving stats for a
3515 * sched_domain while performing load balancing.
3516 *
3517 * @group: sched_group belonging to the sched_domain under consideration.
3518 * @sds: Variable containing the statistics of the sched_domain
3519 * @local_group: Does group contain the CPU for which we're performing
3520 * load balancing ?
3521 * @sgs: Variable containing the statistics of the group.
3522 */
3523static inline void update_sd_power_savings_stats(struct sched_group *group,
3524 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3525{
3526
3527 if (!sds->power_savings_balance)
3528 return;
3529
3530 /*
3531 * If the local group is idle or completely loaded
3532 * no need to do power savings balance at this domain
3533 */
3534 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3535 !sds->this_nr_running))
3536 sds->power_savings_balance = 0;
3537
3538 /*
3539 * If a group is already running at full capacity or idle,
3540 * don't include that group in power savings calculations
3541 */
3542 if (!sds->power_savings_balance ||
3543 sgs->sum_nr_running >= sgs->group_capacity ||
3544 !sgs->sum_nr_running)
3545 return;
3546
3547 /*
3548 * Calculate the group which has the least non-idle load.
3549 * This is the group from where we need to pick up the load
3550 * for saving power
3551 */
3552 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3553 (sgs->sum_nr_running == sds->min_nr_running &&
3554 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3555 sds->group_min = group;
3556 sds->min_nr_running = sgs->sum_nr_running;
3557 sds->min_load_per_task = sgs->sum_weighted_load /
3558 sgs->sum_nr_running;
3559 }
3560
3561 /*
3562 * Calculate the group which is almost near its
3563 * capacity but still has some space to pick up some load
3564 * from other group and save more power
3565 */
3566 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3567 return;
3568
3569 if (sgs->sum_nr_running > sds->leader_nr_running ||
3570 (sgs->sum_nr_running == sds->leader_nr_running &&
3571 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3572 sds->group_leader = group;
3573 sds->leader_nr_running = sgs->sum_nr_running;
3574 }
3575}
3576
3577/**
3578 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3579 * @sds: Variable containing the statistics of the sched_domain
3580 * under consideration.
3581 * @this_cpu: Cpu at which we're currently performing load-balancing.
3582 * @imbalance: Variable to store the imbalance.
3583 *
3584 * Description:
3585 * Check if we have potential to perform some power-savings balance.
3586 * If yes, set the busiest group to be the least loaded group in the
3587 * sched_domain, so that it's CPUs can be put to idle.
3588 *
3589 * Returns 1 if there is potential to perform power-savings balance.
3590 * Else returns 0.
3591 */
3592static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3593 int this_cpu, unsigned long *imbalance)
3594{
3595 if (!sds->power_savings_balance)
3596 return 0;
3597
3598 if (sds->this != sds->group_leader ||
3599 sds->group_leader == sds->group_min)
3600 return 0;
3601
3602 *imbalance = sds->min_load_per_task;
3603 sds->busiest = sds->group_min;
3604
3605 return 1;
3606
3607}
3608#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3609static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3610 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3611{
3612 return;
3613}
3614
3615static inline void update_sd_power_savings_stats(struct sched_group *group,
3616 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3617{
3618 return;
3619}
3620
3621static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3622 int this_cpu, unsigned long *imbalance)
3623{
3624 return 0;
3625}
3626#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3627
3628
3629unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3630{ 3479{
3631 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3654,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3654unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3655{ 3504{
3656 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3657 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507
3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3658 3514
3659 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3515 total = sched_avg_period() + (rq->clock - age_stamp);
3660 3516
3661 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3662 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3663 available = 0; 3519 available = 0;
3664 } else { 3520 } else {
3665 available = total - rq->rt_avg; 3521 available = total - avg;
3666 } 3522 }
3667 3523
3668 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
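
Note: the scale_rt_power() change is about snapshotting values that another CPU may update concurrently before doing arithmetic on them. Below is a minimal userspace illustration of the pattern; READ_ONCE_U64 and available_power are invented stand-ins, while the kernel itself uses ACCESS_ONCE() on rq->age_stamp and rq->rt_avg.

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the kernel's ACCESS_ONCE() */
#define READ_ONCE_U64(x)        (*(volatile uint64_t *)&(x))

static uint64_t rt_avg;         /* updated concurrently in the real code */
static uint64_t age_stamp;      /* ditto */

static uint64_t available_power(uint64_t clock, uint64_t avg_period)
{
        /* read each shared value exactly once, then use the local copies */
        uint64_t stamp = READ_ONCE_U64(age_stamp);
        uint64_t avg = READ_ONCE_U64(rt_avg);
        uint64_t total = avg_period + (clock - stamp);

        if (total < avg)
                return 0;       /* keeps "power" from going negative */
        return total - avg;
}

int main(void)
{
        age_stamp = 1000;
        rt_avg = 300;
        printf("available = %llu\n",
               (unsigned long long)available_power(1500, 1000));
        return 0;
}

Re-reading rt_avg after the total < avg check could observe a newer, larger value and make the subtraction underflow, which is exactly what reading it once into a local avoids.
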
@@ -3725,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3725 3581
3726 power = 0; 3582 power = 0;
3727 3583
3728 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3729 do { 3585 /*
3730 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3731 group = group->next; 3587 * span the current group.
3732 } while (group != child->groups); 3588 */
3733 3589
3734 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3735} 3606}
3736 3607
3737/* 3608/*
@@ -3761,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3761 3632
3762/** 3633/**
3763 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3764 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3765 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3766 * @this_cpu: Cpu for which load balance is currently performed.
3767 * @idle: Idle status of this_cpu
3768 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3769 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing. 3639 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance. 3640 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group. 3641 * @sgs: variable to hold the statistics for this group.
3773 */ 3642 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd, 3643static inline void update_sg_lb_stats(struct lb_env *env,
3775 struct sched_group *group, int this_cpu, 3644 struct sched_group *group, int load_idx,
3776 enum cpu_idle_type idle, int load_idx,
3777 int local_group, const struct cpumask *cpus, 3645 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs) 3646 int *balance, struct sg_lb_stats *sgs)
3779{ 3647{
3780 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3648 unsigned long nr_running, max_nr_running, min_nr_running;
3781 int i; 3649 unsigned long load, max_cpu_load, min_cpu_load;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3650 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long avg_load_per_task = 0; 3651 unsigned long avg_load_per_task = 0;
3652 int i;
3784 3653
3785 if (local_group) 3654 if (local_group)
3786 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3787 3656
3788 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3789 max_cpu_load = 0; 3658 max_cpu_load = 0;
3790 min_cpu_load = ~0UL; 3659 min_cpu_load = ~0UL;
3791 max_nr_running = 0; 3660 max_nr_running = 0;
3661 min_nr_running = ~0UL;
3792 3662
3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3794 struct rq *rq = cpu_rq(i); 3664 struct rq *rq = cpu_rq(i);
3795 3665
3666 nr_running = rq->nr_running;
3667
3796 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3797 if (local_group) { 3669 if (local_group) {
3798 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3799 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3800 balance_cpu = i; 3673 balance_cpu = i;
3801 } 3674 }
@@ -3803,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3803 load = target_load(i, load_idx); 3676 load = target_load(i, load_idx);
3804 } else { 3677 } else {
3805 load = source_load(i, load_idx); 3678 load = source_load(i, load_idx);
3806 if (load > max_cpu_load) { 3679 if (load > max_cpu_load)
3807 max_cpu_load = load; 3680 max_cpu_load = load;
3808 max_nr_running = rq->nr_running;
3809 }
3810 if (min_cpu_load > load) 3681 if (min_cpu_load > load)
3811 min_cpu_load = load; 3682 min_cpu_load = load;
3683
3684 if (nr_running > max_nr_running)
3685 max_nr_running = nr_running;
3686 if (min_nr_running > nr_running)
3687 min_nr_running = nr_running;
3812 } 3688 }
3813 3689
3814 sgs->group_load += load; 3690 sgs->group_load += load;
3815 sgs->sum_nr_running += rq->nr_running; 3691 sgs->sum_nr_running += nr_running;
3816 sgs->sum_weighted_load += weighted_cpuload(i); 3692 sgs->sum_weighted_load += weighted_cpuload(i);
3817 if (idle_cpu(i)) 3693 if (idle_cpu(i))
3818 sgs->idle_cpus++; 3694 sgs->idle_cpus++;
@@ -3825,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3825 * to do the newly idle load balance. 3701 * to do the newly idle load balance.
3826 */ 3702 */
3827 if (local_group) { 3703 if (local_group) {
3828 if (idle != CPU_NEWLY_IDLE) { 3704 if (env->idle != CPU_NEWLY_IDLE) {
3829 if (balance_cpu != this_cpu) { 3705 if (balance_cpu != env->dst_cpu) {
3830 *balance = 0; 3706 *balance = 0;
3831 return; 3707 return;
3832 } 3708 }
3833 update_group_power(sd, this_cpu); 3709 update_group_power(env->sd, env->dst_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3710 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu); 3711 update_group_power(env->sd, env->dst_cpu);
3836 } 3712 }
3837 3713
3838 /* Adjust by relative CPU power of the group */ 3714 /* Adjust by relative CPU power of the group */
@@ -3850,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3850 if (sgs->sum_nr_running) 3726 if (sgs->sum_nr_running)
3851 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3727 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3852 3728
3853 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3729 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3730 (max_nr_running - min_nr_running) > 1)
3854 sgs->group_imb = 1; 3731 sgs->group_imb = 1;
3855 3732
3856 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3733 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3857 SCHED_POWER_SCALE); 3734 SCHED_POWER_SCALE);
3858 if (!sgs->group_capacity) 3735 if (!sgs->group_capacity)
3859 sgs->group_capacity = fix_small_capacity(sd, group); 3736 sgs->group_capacity = fix_small_capacity(env->sd, group);
3860 sgs->group_weight = group->group_weight; 3737 sgs->group_weight = group->group_weight;
3861 3738
3862 if (sgs->group_capacity > sgs->sum_nr_running) 3739 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3865,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3865 3742
3866/** 3743/**
3867 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3868 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3869 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3870 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3871 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3872 * @this_cpu: the current cpu
3873 * 3749 *
3874 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3875 * busiest group. 3751 * busiest group.
3876 */ 3752 */
3877static bool update_sd_pick_busiest(struct sched_domain *sd, 3753static bool update_sd_pick_busiest(struct lb_env *env,
3878 struct sd_lb_stats *sds, 3754 struct sd_lb_stats *sds,
3879 struct sched_group *sg, 3755 struct sched_group *sg,
3880 struct sg_lb_stats *sgs, 3756 struct sg_lb_stats *sgs)
3881 int this_cpu)
3882{ 3757{
3883 if (sgs->avg_load <= sds->max_load) 3758 if (sgs->avg_load <= sds->max_load)
3884 return false; 3759 return false;
@@ -3894,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3894 * numbered CPUs in the group, therefore mark all groups 3769 * numbered CPUs in the group, therefore mark all groups
3895 * higher than ourself as busy. 3770 * higher than ourself as busy.
3896 */ 3771 */
3897 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3772 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3898 this_cpu < group_first_cpu(sg)) { 3773 env->dst_cpu < group_first_cpu(sg)) {
3899 if (!sds->busiest) 3774 if (!sds->busiest)
3900 return true; 3775 return true;
3901 3776
@@ -3908,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3908 3783
3909/** 3784/**
3910 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3911 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3912 * @this_cpu: Cpu for which load balance is currently performed.
3913 * @idle: Idle status of this_cpu
3914 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3915 * @balance: Should we balance. 3788 * @balance: Should we balance.
3916 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
3917 */ 3790 */
3918static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3791static inline void update_sd_lb_stats(struct lb_env *env,
3919 enum cpu_idle_type idle, const struct cpumask *cpus, 3792 const struct cpumask *cpus,
3920 int *balance, struct sd_lb_stats *sds) 3793 int *balance, struct sd_lb_stats *sds)
3921{ 3794{
3922 struct sched_domain *child = sd->child; 3795 struct sched_domain *child = env->sd->child;
3923 struct sched_group *sg = sd->groups; 3796 struct sched_group *sg = env->sd->groups;
3924 struct sg_lb_stats sgs; 3797 struct sg_lb_stats sgs;
3925 int load_idx, prefer_sibling = 0; 3798 int load_idx, prefer_sibling = 0;
3926 3799
3927 if (child && child->flags & SD_PREFER_SIBLING) 3800 if (child && child->flags & SD_PREFER_SIBLING)
3928 prefer_sibling = 1; 3801 prefer_sibling = 1;
3929 3802
3930 init_sd_power_savings_stats(sd, sds, idle); 3803 load_idx = get_sd_load_idx(env->sd, env->idle);
3931 load_idx = get_sd_load_idx(sd, idle);
3932 3804
3933 do { 3805 do {
3934 int local_group; 3806 int local_group;
3935 3807
3936 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3808 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3937 memset(&sgs, 0, sizeof(sgs)); 3809 memset(&sgs, 0, sizeof(sgs));
3938 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3810 update_sg_lb_stats(env, sg, load_idx, local_group,
3939 local_group, cpus, balance, &sgs); 3811 cpus, balance, &sgs);
3940 3812
3941 if (local_group && !(*balance)) 3813 if (local_group && !(*balance))
3942 return; 3814 return;
@@ -3964,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3964 sds->this_load_per_task = sgs.sum_weighted_load; 3836 sds->this_load_per_task = sgs.sum_weighted_load;
3965 sds->this_has_capacity = sgs.group_has_capacity; 3837 sds->this_has_capacity = sgs.group_has_capacity;
3966 sds->this_idle_cpus = sgs.idle_cpus; 3838 sds->this_idle_cpus = sgs.idle_cpus;
3967 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3839 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3968 sds->max_load = sgs.avg_load; 3840 sds->max_load = sgs.avg_load;
3969 sds->busiest = sg; 3841 sds->busiest = sg;
3970 sds->busiest_nr_running = sgs.sum_nr_running; 3842 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3976,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3976 sds->group_imb = sgs.group_imb; 3848 sds->group_imb = sgs.group_imb;
3977 } 3849 }
3978 3850
3979 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3980 sg = sg->next; 3851 sg = sg->next;
3981 } while (sg != sd->groups); 3852 } while (sg != env->sd->groups);
3982} 3853}
3983 3854
3984/** 3855/**
@@ -4001,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4001 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
4002 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
4003 * 3874 *
4004 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
4005 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
4006 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4007 * @imbalance: returns amount of imbalanced due to packing.
4008 */ 3877 */
4009static int check_asym_packing(struct sched_domain *sd, 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4010 struct sd_lb_stats *sds,
4011 int this_cpu, unsigned long *imbalance)
4012{ 3879{
4013 int busiest_cpu; 3880 int busiest_cpu;
4014 3881
4015 if (!(sd->flags & SD_ASYM_PACKING)) 3882 if (!(env->sd->flags & SD_ASYM_PACKING))
4016 return 0; 3883 return 0;
4017 3884
4018 if (!sds->busiest) 3885 if (!sds->busiest)
4019 return 0; 3886 return 0;
4020 3887
4021 busiest_cpu = group_first_cpu(sds->busiest); 3888 busiest_cpu = group_first_cpu(sds->busiest);
4022 if (this_cpu > busiest_cpu) 3889 if (env->dst_cpu > busiest_cpu)
4023 return 0; 3890 return 0;
4024 3891
4025 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3892 env->imbalance = DIV_ROUND_CLOSEST(
4026 SCHED_POWER_SCALE); 3893 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3894
4027 return 1; 3895 return 1;
4028} 3896}
4029 3897
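check_asym_packing() now stores its result in env->imbalance rather than through an *imbalance pointer; the value computed is unchanged: the busiest group's load scaled by that group's cpu power. A standalone arithmetic sketch, assuming SCHED_POWER_SCALE is 1024 (its customary value) and using invented load and power numbers.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL	/* assumed: 1 << 10 */

/* Round-to-nearest division for positive operands, in the spirit of
 * DIV_ROUND_CLOSEST. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	unsigned long max_load = 1536;		/* busiest group's avg_load    */
	unsigned long group_power = 2048;	/* e.g. two default-power CPUs */

	unsigned long imbalance =
		div_round_closest(max_load * group_power, SCHED_POWER_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* 3072 */
	return 0;
}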
@@ -4031,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd,
4031 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
4032 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
4033 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
4034 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4035 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4036 * @imbalance: Variable to store the imbalance.
4037 */ 3904 */
4038static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3905static inline
4039 int this_cpu, unsigned long *imbalance) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4040{ 3907{
4041 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3908 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4042 unsigned int imbn = 2; 3909 unsigned int imbn = 2;
@@ -4047,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4047 if (sds->busiest_load_per_task > 3914 if (sds->busiest_load_per_task >
4048 sds->this_load_per_task) 3915 sds->this_load_per_task)
4049 imbn = 1; 3916 imbn = 1;
4050 } else 3917 } else {
4051 sds->this_load_per_task = 3918 sds->this_load_per_task =
4052 cpu_avg_load_per_task(this_cpu); 3919 cpu_avg_load_per_task(env->dst_cpu);
3920 }
4053 3921
4054 scaled_busy_load_per_task = sds->busiest_load_per_task 3922 scaled_busy_load_per_task = sds->busiest_load_per_task
4055 * SCHED_POWER_SCALE; 3923 * SCHED_POWER_SCALE;
@@ -4057,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4057 3925
4058 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3926 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4059 (scaled_busy_load_per_task * imbn)) { 3927 (scaled_busy_load_per_task * imbn)) {
4060 *imbalance = sds->busiest_load_per_task; 3928 env->imbalance = sds->busiest_load_per_task;
4061 return; 3929 return;
4062 } 3930 }
4063 3931
@@ -4094,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4094 3962
4095 /* Move if we gain throughput */ 3963 /* Move if we gain throughput */
4096 if (pwr_move > pwr_now) 3964 if (pwr_move > pwr_now)
4097 *imbalance = sds->busiest_load_per_task; 3965 env->imbalance = sds->busiest_load_per_task;
4098} 3966}
4099 3967
4100/** 3968/**
4101 * calculate_imbalance - Calculate the amount of imbalance present within the 3969 * calculate_imbalance - Calculate the amount of imbalance present within the
4102 * groups of a given sched_domain during load balance. 3970 * groups of a given sched_domain during load balance.
3971 * @env: load balance environment
4103 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3972 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4104 * @this_cpu: Cpu for which currently load balance is being performed.
4105 * @imbalance: The variable to store the imbalance.
4106 */ 3973 */
4107static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3974static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4108 unsigned long *imbalance)
4109{ 3975{
4110 unsigned long max_pull, load_above_capacity = ~0UL; 3976 unsigned long max_pull, load_above_capacity = ~0UL;
4111 3977
@@ -4121,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4121 * its cpu_power, while calculating max_load..) 3987 * its cpu_power, while calculating max_load..)
4122 */ 3988 */
4123 if (sds->max_load < sds->avg_load) { 3989 if (sds->max_load < sds->avg_load) {
4124 *imbalance = 0; 3990 env->imbalance = 0;
4125 return fix_small_imbalance(sds, this_cpu, imbalance); 3991 return fix_small_imbalance(env, sds);
4126 } 3992 }
4127 3993
4128 if (!sds->group_imb) { 3994 if (!sds->group_imb) {
@@ -4150,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4150 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4016 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4151 4017
4152 /* How much load to actually move to equalise the imbalance */ 4018 /* How much load to actually move to equalise the imbalance */
4153 *imbalance = min(max_pull * sds->busiest->sgp->power, 4019 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4154 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4020 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4155 / SCHED_POWER_SCALE; 4021 / SCHED_POWER_SCALE;
4156 4022
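The imbalance formula itself is untouched by the conversion; only its destination changes from *imbalance to env->imbalance. A worked standalone example of the min() between what the busiest group has above the average and what the local group can absorb below it, using invented statistics and ignoring the load_above_capacity clamp shown in the surrounding context.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL	/* assumed scale, as in the sketch above */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load  = 2048;	/* busiest group  */
	unsigned long this_load = 512;	/* local group    */
	unsigned long avg_load  = 1280;	/* domain average */
	unsigned long busiest_power = 1024, this_power = 1024;

	unsigned long max_pull = max_load - avg_load;	/* 768 */

	/* Move the smaller of: the busiest group's excess over the average,
	 * and the local group's head-room below it. */
	unsigned long imbalance =
		min_ul(max_pull * busiest_power,
		       (avg_load - this_load) * this_power) / SCHED_POWER_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* 768 */
	return 0;
}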
@@ -4160,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4160 * a think about bumping its value to force at least one task to be 4026 * a think about bumping its value to force at least one task to be
4161 * moved 4027 * moved
4162 */ 4028 */
4163 if (*imbalance < sds->busiest_load_per_task) 4029 if (env->imbalance < sds->busiest_load_per_task)
4164 return fix_small_imbalance(sds, this_cpu, imbalance); 4030 return fix_small_imbalance(env, sds);
4165 4031
4166} 4032}
4167 4033
@@ -4177,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4177 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4178 * to restore balance. 4044 * to restore balance.
4179 * 4045 *
4180 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4181 * @this_cpu: The cpu for which load balancing is currently being performed.
4182 * @imbalance: Variable which stores amount of weighted load which should
4183 * be moved to restore balance/put a group to idle.
4184 * @idle: The idle status of this_cpu.
4185 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4186 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4187 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
@@ -4192,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4192 * put to idle by rebalancing its tasks onto our group. 4054 * put to idle by rebalancing its tasks onto our group.
4193 */ 4055 */
4194static struct sched_group * 4056static struct sched_group *
4195find_busiest_group(struct sched_domain *sd, int this_cpu, 4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4196 unsigned long *imbalance, enum cpu_idle_type idle,
4197 const struct cpumask *cpus, int *balance)
4198{ 4058{
4199 struct sd_lb_stats sds; 4059 struct sd_lb_stats sds;
4200 4060
@@ -4204,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4204 * Compute the various statistics relavent for load balancing at 4064 * Compute the various statistics relavent for load balancing at
4205 * this level. 4065 * this level.
4206 */ 4066 */
4207 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4067 update_sd_lb_stats(env, cpus, balance, &sds);
4208 4068
4209 /* 4069 /*
4210 * this_cpu is not the appropriate cpu to perform load balancing at 4070 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4213,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4213 if (!(*balance)) 4073 if (!(*balance))
4214 goto ret; 4074 goto ret;
4215 4075
4216 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4076 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4217 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4077 check_asym_packing(env, &sds))
4218 return sds.busiest; 4078 return sds.busiest;
4219 4079
4220 /* There is no busy sibling group to pull tasks from */ 4080 /* There is no busy sibling group to pull tasks from */
@@ -4232,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4232 goto force_balance; 4092 goto force_balance;
4233 4093
4234 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4094 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4235 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4095 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4236 !sds.busiest_has_capacity) 4096 !sds.busiest_has_capacity)
4237 goto force_balance; 4097 goto force_balance;
4238 4098
@@ -4250,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4250 if (sds.this_load >= sds.avg_load) 4110 if (sds.this_load >= sds.avg_load)
4251 goto out_balanced; 4111 goto out_balanced;
4252 4112
4253 if (idle == CPU_IDLE) { 4113 if (env->idle == CPU_IDLE) {
4254 /* 4114 /*
4255 * This cpu is idle. If the busiest group load doesn't 4115 * This cpu is idle. If the busiest group load doesn't
4256 * have more tasks than the number of available cpu's and 4116 * have more tasks than the number of available cpu's and
@@ -4265,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4265 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4125 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4266 * imbalance_pct to be conservative. 4126 * imbalance_pct to be conservative.
4267 */ 4127 */
4268 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4128 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4269 goto out_balanced; 4129 goto out_balanced;
4270 } 4130 }
4271 4131
4272force_balance: 4132force_balance:
4273 /* Looks like there is an imbalance. Compute it */ 4133 /* Looks like there is an imbalance. Compute it */
4274 calculate_imbalance(&sds, this_cpu, imbalance); 4134 calculate_imbalance(env, &sds);
4275 return sds.busiest; 4135 return sds.busiest;
4276 4136
4277out_balanced: 4137out_balanced:
4278 /*
4279 * There is no obvious imbalance. But check if we can do some balancing
4280 * to save power.
4281 */
4282 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4283 return sds.busiest;
4284ret: 4138ret:
4285 *imbalance = 0; 4139 env->imbalance = 0;
4286 return NULL; 4140 return NULL;
4287} 4141}
4288 4142
4289/* 4143/*
4290 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4144 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4291 */ 4145 */
4292static struct rq * 4146static struct rq *find_busiest_queue(struct lb_env *env,
4293find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4147 struct sched_group *group,
4294 enum cpu_idle_type idle, unsigned long imbalance, 4148 const struct cpumask *cpus)
4295 const struct cpumask *cpus)
4296{ 4149{
4297 struct rq *busiest = NULL, *rq; 4150 struct rq *busiest = NULL, *rq;
4298 unsigned long max_load = 0; 4151 unsigned long max_load = 0;
@@ -4305,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4305 unsigned long wl; 4158 unsigned long wl;
4306 4159
4307 if (!capacity) 4160 if (!capacity)
4308 capacity = fix_small_capacity(sd, group); 4161 capacity = fix_small_capacity(env->sd, group);
4309 4162
4310 if (!cpumask_test_cpu(i, cpus)) 4163 if (!cpumask_test_cpu(i, cpus))
4311 continue; 4164 continue;
@@ -4317,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4317 * When comparing with imbalance, use weighted_cpuload() 4170 * When comparing with imbalance, use weighted_cpuload()
4318 * which is not scaled with the cpu power. 4171 * which is not scaled with the cpu power.
4319 */ 4172 */
4320 if (capacity && rq->nr_running == 1 && wl > imbalance) 4173 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4321 continue; 4174 continue;
4322 4175
4323 /* 4176 /*
@@ -4346,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4346/* Working cpumask for load_balance and load_balance_newidle. */ 4199/* Working cpumask for load_balance and load_balance_newidle. */
4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4200DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4348 4201
4349static int need_active_balance(struct sched_domain *sd, int idle, 4202static int need_active_balance(struct lb_env *env)
4350 int busiest_cpu, int this_cpu)
4351{ 4203{
4352 if (idle == CPU_NEWLY_IDLE) { 4204 struct sched_domain *sd = env->sd;
4205
4206 if (env->idle == CPU_NEWLY_IDLE) {
4353 4207
4354 /* 4208 /*
4355 * ASYM_PACKING needs to force migrate tasks from busy but 4209 * ASYM_PACKING needs to force migrate tasks from busy but
4356 * higher numbered CPUs in order to pack all tasks in the 4210 * higher numbered CPUs in order to pack all tasks in the
4357 * lowest numbered CPUs. 4211 * lowest numbered CPUs.
4358 */ 4212 */
4359 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4213 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4360 return 1; 4214 return 1;
4361
4362 /*
4363 * The only task running in a non-idle cpu can be moved to this
4364 * cpu in an attempt to completely freeup the other CPU
4365 * package.
4366 *
4367 * The package power saving logic comes from
4368 * find_busiest_group(). If there are no imbalance, then
4369 * f_b_g() will return NULL. However when sched_mc={1,2} then
4370 * f_b_g() will select a group from which a running task may be
4371 * pulled to this cpu in order to make the other package idle.
4372 * If there is no opportunity to make a package idle and if
4373 * there are no imbalance, then f_b_g() will return NULL and no
4374 * action will be taken in load_balance_newidle().
4375 *
4376 * Under normal task pull operation due to imbalance, there
4377 * will be more than one task in the source run queue and
4378 * move_tasks() will succeed. ld_moved will be true and this
4379 * active balance code will not be triggered.
4380 */
4381 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4382 return 0;
4383 } 4215 }
4384 4216
4385 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4217 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4397,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4397{ 4229{
4398 int ld_moved, active_balance = 0; 4230 int ld_moved, active_balance = 0;
4399 struct sched_group *group; 4231 struct sched_group *group;
4400 unsigned long imbalance;
4401 struct rq *busiest; 4232 struct rq *busiest;
4402 unsigned long flags; 4233 unsigned long flags;
4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4234 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4407,7 +4238,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4407 .dst_cpu = this_cpu, 4238 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq, 4239 .dst_rq = this_rq,
4409 .idle = idle, 4240 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate, 4241 .loop_break = sched_nr_migrate_break,
4411 }; 4242 };
4412 4243
4413 cpumask_copy(cpus, cpu_active_mask); 4244 cpumask_copy(cpus, cpu_active_mask);
@@ -4415,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4415 schedstat_inc(sd, lb_count[idle]); 4246 schedstat_inc(sd, lb_count[idle]);
4416 4247
4417redo: 4248redo:
4418 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4249 group = find_busiest_group(&env, cpus, balance);
4419 cpus, balance);
4420 4250
4421 if (*balance == 0) 4251 if (*balance == 0)
4422 goto out_balanced; 4252 goto out_balanced;
@@ -4426,7 +4256,7 @@ redo:
4426 goto out_balanced; 4256 goto out_balanced;
4427 } 4257 }
4428 4258
4429 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4259 busiest = find_busiest_queue(&env, group, cpus);
4430 if (!busiest) { 4260 if (!busiest) {
4431 schedstat_inc(sd, lb_nobusyq[idle]); 4261 schedstat_inc(sd, lb_nobusyq[idle]);
4432 goto out_balanced; 4262 goto out_balanced;
@@ -4434,7 +4264,7 @@ redo:
4434 4264
4435 BUG_ON(busiest == this_rq); 4265 BUG_ON(busiest == this_rq);
4436 4266
4437 schedstat_add(sd, lb_imbalance[idle], imbalance); 4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4438 4268
4439 ld_moved = 0; 4269 ld_moved = 0;
4440 if (busiest->nr_running > 1) { 4270 if (busiest->nr_running > 1) {
@@ -4445,10 +4275,9 @@ redo:
4445 * correctly treated as an imbalance. 4275 * correctly treated as an imbalance.
4446 */ 4276 */
4447 env.flags |= LBF_ALL_PINNED; 4277 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance; 4278 env.src_cpu = busiest->cpu;
4449 env.src_cpu = busiest->cpu; 4279 env.src_rq = busiest;
4450 env.src_rq = busiest; 4280 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4451 env.loop_max = busiest->nr_running;
4452 4281
4453more_balance: 4282more_balance:
4454 local_irq_save(flags); 4283 local_irq_save(flags);
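Besides moving src_cpu/src_rq into the environment, the hunk above caps env.loop_max at sysctl_sched_nr_migrate instead of scanning every task on the busiest runqueue. A trivial standalone illustration of the cap, assuming the sysctl's usual default of 32 (it is tunable at runtime).

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int nr_migrate = 32;	/* assumed sysctl_sched_nr_migrate default */
	unsigned int nr_running[] = { 3, 32, 500 };

	for (int i = 0; i < 3; i++)
		printf("busiest nr_running=%3u -> loop_max=%u\n",
		       nr_running[i], min_u(nr_migrate, nr_running[i]));
	return 0;
}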
@@ -4490,7 +4319,7 @@ more_balance:
4490 if (idle != CPU_NEWLY_IDLE) 4319 if (idle != CPU_NEWLY_IDLE)
4491 sd->nr_balance_failed++; 4320 sd->nr_balance_failed++;
4492 4321
4493 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4322 if (need_active_balance(&env)) {
4494 raw_spin_lock_irqsave(&busiest->lock, flags); 4323 raw_spin_lock_irqsave(&busiest->lock, flags);
4495 4324
4496 /* don't kick the active_load_balance_cpu_stop, 4325 /* don't kick the active_load_balance_cpu_stop,
@@ -4517,10 +4346,11 @@ more_balance:
4517 } 4346 }
4518 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4347 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4519 4348
4520 if (active_balance) 4349 if (active_balance) {
4521 stop_one_cpu_nowait(cpu_of(busiest), 4350 stop_one_cpu_nowait(cpu_of(busiest),
4522 active_load_balance_cpu_stop, busiest, 4351 active_load_balance_cpu_stop, busiest,
4523 &busiest->active_balance_work); 4352 &busiest->active_balance_work);
4353 }
4524 4354
4525 /* 4355 /*
4526 * We've kicked active balancing, reset the failure 4356 * We've kicked active balancing, reset the failure
@@ -4701,104 +4531,15 @@ static struct {
4701 unsigned long next_balance; /* in jiffy units */ 4531 unsigned long next_balance; /* in jiffy units */
4702} nohz ____cacheline_aligned; 4532} nohz ____cacheline_aligned;
4703 4533
4704#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4534static inline int find_new_ilb(int call_cpu)
4705/**
4706 * lowest_flag_domain - Return lowest sched_domain containing flag.
4707 * @cpu: The cpu whose lowest level of sched domain is to
4708 * be returned.
4709 * @flag: The flag to check for the lowest sched_domain
4710 * for the given cpu.
4711 *
4712 * Returns the lowest sched_domain of a cpu which contains the given flag.
4713 */
4714static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4715{
4716 struct sched_domain *sd;
4717
4718 for_each_domain(cpu, sd)
4719 if (sd->flags & flag)
4720 break;
4721
4722 return sd;
4723}
4724
4725/**
4726 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4727 * @cpu: The cpu whose domains we're iterating over.
4728 * @sd: variable holding the value of the power_savings_sd
4729 * for cpu.
4730 * @flag: The flag to filter the sched_domains to be iterated.
4731 *
4732 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4733 * set, starting from the lowest sched_domain to the highest.
4734 */
4735#define for_each_flag_domain(cpu, sd, flag) \
4736 for (sd = lowest_flag_domain(cpu, flag); \
4737 (sd && (sd->flags & flag)); sd = sd->parent)
4738
4739/**
4740 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4741 * @cpu: The cpu which is nominating a new idle_load_balancer.
4742 *
4743 * Returns: Returns the id of the idle load balancer if it exists,
4744 * Else, returns >= nr_cpu_ids.
4745 *
4746 * This algorithm picks the idle load balancer such that it belongs to a
4747 * semi-idle powersavings sched_domain. The idea is to try and avoid
4748 * completely idle packages/cores just for the purpose of idle load balancing
4749 * when there are other idle cpu's which are better suited for that job.
4750 */
4751static int find_new_ilb(int cpu)
4752{ 4535{
4753 int ilb = cpumask_first(nohz.idle_cpus_mask); 4536 int ilb = cpumask_first(nohz.idle_cpus_mask);
4754 struct sched_group *ilbg;
4755 struct sched_domain *sd;
4756
4757 /*
4758 * Have idle load balancer selection from semi-idle packages only
4759 * when power-aware load balancing is enabled
4760 */
4761 if (!(sched_smt_power_savings || sched_mc_power_savings))
4762 goto out_done;
4763
4764 /*
4765 * Optimize for the case when we have no idle CPUs or only one
4766 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4767 */
4768 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4769 goto out_done;
4770 4537
4771 rcu_read_lock();
4772 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4773 ilbg = sd->groups;
4774
4775 do {
4776 if (ilbg->group_weight !=
4777 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4778 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4779 sched_group_cpus(ilbg));
4780 goto unlock;
4781 }
4782
4783 ilbg = ilbg->next;
4784
4785 } while (ilbg != sd->groups);
4786 }
4787unlock:
4788 rcu_read_unlock();
4789
4790out_done:
4791 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4538 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4792 return ilb; 4539 return ilb;
4793 4540
4794 return nr_cpu_ids; 4541 return nr_cpu_ids;
4795} 4542}
4796#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4797static inline int find_new_ilb(int call_cpu)
4798{
4799 return nr_cpu_ids;
4800}
4801#endif
4802 4543
4803/* 4544/*
4804 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4545 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5021,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5021 4762
5022 raw_spin_lock_irq(&this_rq->lock); 4763 raw_spin_lock_irq(&this_rq->lock);
5023 update_rq_clock(this_rq); 4764 update_rq_clock(this_rq);
5024 update_cpu_load(this_rq); 4765 update_idle_cpu_load(this_rq);
5025 raw_spin_unlock_irq(&this_rq->lock); 4766 raw_spin_unlock_irq(&this_rq->lock);
5026 4767
5027 rebalance_domains(balance_cpu, CPU_IDLE); 4768 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d0..de00a486c5c6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, false) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
71SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
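Throughout rt.c the allowed-CPU count is now read from the task itself (p->nr_cpus_allowed) rather than from its RT entity (p->rt.nr_cpus_allowed), which is why inc_rt_migration()/dec_rt_migration() first resolve the owning task with rt_task_of(). A compile-only mock of the field's relocation; both structures are placeholders, not the kernel's definitions.

/* Mock layout: only the position of nr_cpus_allowed mirrors the change. */
struct sched_rt_entity_sketch {
	int dummy;			/* before: the count lived here */
};

struct task_struct_sketch {
	int nr_cpus_allowed;		/* after: kept on the task, visible to
					 * every scheduling class, not just RT */
	struct sched_rt_entity_sketch rt;
};

static inline int task_is_migratable(const struct task_struct_sketch *p)
{
	return p->nr_cpus_allowed > 1;	/* the test the diff keeps using */
}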
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1809static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1810 const struct cpumask *new_mask)
1805{ 1811{
1806 int weight = cpumask_weight(new_mask); 1812 struct rq *rq;
1813 int weight;
1807 1814
1808 BUG_ON(!rt_task(p)); 1815 BUG_ON(!rt_task(p));
1809 1816
1810 /* 1817 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1818 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816 1819
1817 if (!task_current(rq, p)) { 1820 weight = cpumask_weight(new_mask);
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1821
1827 /* 1822 /*
1828 * Requeue if our weight is changing and still > 1 1823 * Only update if the process changes its state from whether it
1829 */ 1824 * can migrate or not.
1830 if (weight > 1) 1825 */
1831 enqueue_pushable_task(rq, p); 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1832 1827 return;
1833 }
1834 1828
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1829 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1830
1842 update_rt_migration(&rq->rt); 1831 /*
1832 * The process used to be able to migrate OR it can now migrate
1833 */
1834 if (weight <= 1) {
1835 if (!task_current(rq, p))
1836 dequeue_pushable_task(rq, p);
1837 BUG_ON(!rq->rt.rt_nr_migratory);
1838 rq->rt.rt_nr_migratory--;
1839 } else {
1840 if (!task_current(rq, p))
1841 enqueue_pushable_task(rq, p);
1842 rq->rt.rt_nr_migratory++;
1843 } 1843 }
1844
1845 update_rt_migration(&rq->rt);
1844} 1846}
1845 1847
1846/* Assumes rq->lock is held */ 1848/* Assumes rq->lock is held */
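The rewritten set_cpus_allowed_rt() above only touches the rt_nr_migratory counters when the new mask actually flips the task between "pinned to one CPU" and "free to migrate"; if (p->nr_cpus_allowed > 1) and (weight > 1) agree, it returns early. A standalone truth table of that guard with a few sample widths.

#include <stdio.h>

/* 1 when the migratory bookkeeping must change: the task flips between
 * single-CPU and multi-CPU affinity. */
static int migratability_changes(int old_nr_cpus_allowed, int new_weight)
{
	return (old_nr_cpus_allowed > 1) != (new_weight > 1);
}

int main(void)
{
	int old_w[] = { 1, 1, 4, 4 };
	int new_w[] = { 1, 4, 1, 8 };

	for (int i = 0; i < 4; i++)
		printf("old=%d new=%d -> update=%d\n",
		       old_w[i], new_w[i],
		       migratability_changes(old_w[i], new_w[i]));
	/* Only the 1->4 and 4->1 cases report an update. */
	return 0;
}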
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1983 1985
1984static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1985{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1986 update_curr_rt(rq); 1990 update_curr_rt(rq);
1987 1991
1988 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2000 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
2001 2005
2002 /* 2006 /*
2003 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are the
2004 * on the queue: 2008 * only element on the queue
2005 */ 2009 */
2006 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2007 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2008 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2009 } 2016 }
2010} 2017}
2011 2018
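The round-robin requeue in task_tick_rt() now walks every level of the task's RT group hierarchy and requeues as soon as any level has a sibling on its runlist. The "do I have a sibling?" test is the circular-list idiom the diff keeps: on a one-element kernel list, prev and next point at the same node. A standalone illustration with a minimal doubly linked list (not the kernel's list.h).

#include <stdio.h>

/* Minimal circular doubly-linked list node, shaped like a list_head for
 * this one property only. */
struct node {
	struct node *prev, *next;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* prev and next of an entry differ only when more than one element
 * (besides the list head) sits on the ring. */
static int has_sibling(const struct node *entry)
{
	return entry->prev != entry->next;
}

int main(void)
{
	struct node queue, a, b;

	list_init(&queue);
	list_add_tail(&a, &queue);
	printf("alone: has_sibling=%d\n", has_sibling(&a));	/* 0 */

	list_add_tail(&b, &queue);
	printf("two:   has_sibling=%d\n", has_sibling(&a));	/* 1 */
	return 0;
}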
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
@@ -876,7 +878,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 878extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 879extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 880
879extern void update_cpu_load(struct rq *this_rq); 881extern void update_idle_cpu_load(struct rq *this_rq);
880 882
881#ifdef CONFIG_CGROUP_CPUACCT 883#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 884#include <linux/cgroup.h>