path: root/kernel/sched/core.c
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--    kernel/sched/core.c    667
1 file changed, 419 insertions(+), 248 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
141#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
142 #name , 143 #name ,
143 144
144static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
145#include "features.h" 146#include "features.h"
146 NULL
147}; 147};
148 148
149#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2081#endif
2084 2082
2085 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2085 switch_to(prev, next, prev);
2087 2086
2088 barrier(); 2087 barrier();
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2485 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2486 * every tick. We fix it up based on jiffies.
2488 */ 2487 */
2489void update_cpu_load(struct rq *this_rq) 2488static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2489 unsigned long pending_updates)
2490{ 2490{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2491 int i, scale;
2495 2492
2496 this_rq->nr_load_updates++; 2493 this_rq->nr_load_updates++;
2497 2494
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2495 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2496 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2497 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2527} 2517}
2528 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2533/*
2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2535 * idle balance.
2536 */
2537void update_idle_cpu_load(struct rq *this_rq)
2538{
2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2540 unsigned long load = this_rq->load.weight;
2541 unsigned long pending_updates;
2542
2543 /*
2544 * bail if there's load or we're actually up-to-date.
2545 */
2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2547 return;
2548
2549 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2550 this_rq->last_load_update_tick = curr_jiffies;
2551
2552 __update_cpu_load(this_rq, load, pending_updates);
2553}
2554
2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2582 * Called from scheduler_tick()
2583 */
2529static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2530{ 2585{
2531 update_cpu_load(this_rq); 2586 /*
2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2588 */
2589 this_rq->last_load_update_tick = jiffies;
2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2591
2533 calc_load_account_active(this_rq); 2592 calc_load_account_active(this_rq);
2534} 2593}
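
The two hunks above split the per-jiffy bookkeeping out of the decay loop: __update_cpu_load() now receives this_load and pending_updates from its callers, the new NO_HZ helpers update_idle_cpu_load() and update_cpu_load_nohz() compute the jiffy delta themselves (the latter under the rq lock, folding the idle period in as load 0), and the regular tick path always passes 1. As a rough illustration of that folding, here is a minimal user-space sketch; it approximates the kernel's decay_load_missed() lookup table with a repeated (2^i - 1)/2^i decay, so the numbers are not the kernel's:

/*
 * Illustrative user-space sketch, not the kernel code: fold a per-index
 * load average forward over "pending_updates" missed ticks, then blend in
 * the current load.
 */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
    while (missed--)
        load -= load >> idx;    /* load *= (2^idx - 1) / 2^idx */
    return load;
}

static void fold_cpu_load(unsigned long this_load, unsigned long pending_updates)
{
    int i;

    cpu_load[0] = this_load;    /* fasttrack for idx 0 */
    for (i = 1; i < CPU_LOAD_IDX_MAX; i++) {
        unsigned long old = decay(cpu_load[i], pending_updates - 1, i);

        /* new = (old * (2^i - 1) + this_load) / 2^i */
        cpu_load[i] = (old * ((1UL << i) - 1) + this_load) >> i;
    }
}

int main(void)
{
    int i;

    for (i = 1; i < CPU_LOAD_IDX_MAX; i++)
        cpu_load[i] = 1024;

    fold_cpu_load(0, 7);    /* e.g. leaving nohz idle after 7 jiffies */
    for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
        printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
    return 0;
}

Handing the helper a pending_updates count lets an idle CPU catch up in one call instead of replaying the loop once per missed jiffy.
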
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3172 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3173 print_irqtrace_events(prev);
3115 dump_stack(); 3174 dump_stack();
3175 add_taint(TAINT_WARN);
3116} 3176}
3117 3177
3118/* 3178/*
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p)
4042 4102
4043 rcu_read_lock(); 4103 rcu_read_lock();
4044 pcred = __task_cred(p); 4104 pcred = __task_cred(p);
4045 if (cred->user->user_ns == pcred->user->user_ns) 4105 match = (uid_eq(cred->euid, pcred->euid) ||
4046 match = (cred->euid == pcred->euid || 4106 uid_eq(cred->euid, pcred->uid));
4047 cred->euid == pcred->uid);
4048 else
4049 match = false;
4050 rcu_read_unlock(); 4107 rcu_read_unlock();
4051 return match; 4108 return match;
4052} 4109}
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4957 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4958 5015
4959 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4960 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4961} 5018}
4962 5019
4963/* 5020/*
@@ -5499,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5499 5556
5500#ifdef CONFIG_SCHED_DEBUG 5557#ifdef CONFIG_SCHED_DEBUG
5501 5558
5502static __read_mostly int sched_domain_debug_enabled; 5559static __read_mostly int sched_debug_enabled;
5503 5560
5504static int __init sched_domain_debug_setup(char *str) 5561static int __init sched_debug_setup(char *str)
5505{ 5562{
5506 sched_domain_debug_enabled = 1; 5563 sched_debug_enabled = 1;
5507 5564
5508 return 0; 5565 return 0;
5509} 5566}
5510early_param("sched_debug", sched_domain_debug_setup); 5567early_param("sched_debug", sched_debug_setup);
5568
5569static inline bool sched_debug(void)
5570{
5571 return sched_debug_enabled;
5572}
5511 5573
5512static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5574static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5513 struct cpumask *groupmask) 5575 struct cpumask *groupmask)
@@ -5547,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5547 break; 5609 break;
5548 } 5610 }
5549 5611
5550 if (!group->sgp->power) { 5612 /*
5613 * Even though we initialize ->power to something semi-sane,
5614 * we leave power_orig unset. This allows us to detect if
5615 * domain iteration is still funny without causing /0 traps.
5616 */
5617 if (!group->sgp->power_orig) {
5551 printk(KERN_CONT "\n"); 5618 printk(KERN_CONT "\n");
5552 printk(KERN_ERR "ERROR: domain->cpu_power not " 5619 printk(KERN_ERR "ERROR: domain->cpu_power not "
5553 "set\n"); 5620 "set\n");
@@ -5560,7 +5627,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5627 break;
5561 } 5628 }
5562 5629
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5630 if (!(sd->flags & SD_OVERLAP) &&
5631 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5632 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5633 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5634 break;
@@ -5594,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5594{ 5662{
5595 int level = 0; 5663 int level = 0;
5596 5664
5597 if (!sched_domain_debug_enabled) 5665 if (!sched_debug_enabled)
5598 return; 5666 return;
5599 5667
5600 if (!sd) { 5668 if (!sd) {
@@ -5615,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5615} 5683}
5616#else /* !CONFIG_SCHED_DEBUG */ 5684#else /* !CONFIG_SCHED_DEBUG */
5617# define sched_domain_debug(sd, cpu) do { } while (0) 5685# define sched_domain_debug(sd, cpu) do { } while (0)
5686static inline bool sched_debug(void)
5687{
5688 return false;
5689}
5618#endif /* CONFIG_SCHED_DEBUG */ 5690#endif /* CONFIG_SCHED_DEBUG */
5619 5691
5620static int sd_degenerate(struct sched_domain *sd) 5692static int sd_degenerate(struct sched_domain *sd)
@@ -5898,99 +5970,11 @@ static int __init isolated_cpu_setup(char *str)
5898 5970
5899__setup("isolcpus=", isolated_cpu_setup); 5971__setup("isolcpus=", isolated_cpu_setup);
5900 5972
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5973static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5974{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5975 return cpumask_of_node(cpu_to_node(cpu));
5990} 5976}
5991 5977
5992int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5993
5994struct sd_data { 5978struct sd_data {
5995 struct sched_domain **__percpu sd; 5979 struct sched_domain **__percpu sd;
5996 struct sched_group **__percpu sg; 5980 struct sched_group **__percpu sg;
@@ -6020,9 +6004,48 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 6004 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 6005 sched_domain_mask_f mask;
6022 int flags; 6006 int flags;
6007 int numa_level;
6023 struct sd_data data; 6008 struct sd_data data;
6024}; 6009};
6025 6010
6011/*
6012 * Build an iteration mask that can exclude certain CPUs from the upwards
6013 * domain traversal.
6014 *
6015 * Asymmetric node setups can result in situations where the domain tree is of
6016 * unequal depth, make sure to skip domains that already cover the entire
6017 * range.
6018 *
6019 * In that case build_sched_domains() will have terminated the iteration early
6020 * and our sibling sd spans will be empty. Domains should always include the
6021 * cpu they're built on, so check that.
6022 *
6023 */
6024static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6025{
6026 const struct cpumask *span = sched_domain_span(sd);
6027 struct sd_data *sdd = sd->private;
6028 struct sched_domain *sibling;
6029 int i;
6030
6031 for_each_cpu(i, span) {
6032 sibling = *per_cpu_ptr(sdd->sd, i);
6033 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6034 continue;
6035
6036 cpumask_set_cpu(i, sched_group_mask(sg));
6037 }
6038}
6039
6040/*
6041 * Return the canonical balance cpu for this group, this is the first cpu
6042 * of this group that's also in the iteration mask.
6043 */
6044int group_balance_cpu(struct sched_group *sg)
6045{
6046 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6047}
6048
6026static int 6049static int
6027build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6050build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6028{ 6051{
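
build_group_mask() fills in, per group, the CPUs that may take part in the upward domain traversal (CPUs whose sibling domain span does not cover them are skipped), and group_balance_cpu() then returns the first CPU that is both in the group and in that mask. A tiny user-space sketch of the selection, assuming the masks fit in one unsigned long (the kernel operates on real cpumasks via cpumask_first_and()):

#include <stdio.h>

/* first set bit, or -1 when the mask is empty (cf. cpumask_first_and()) */
static int first_cpu(unsigned long mask)
{
    int i;

    for (i = 0; i < 64; i++)
        if (mask & (1UL << i))
            return i;
    return -1;
}

static int balance_cpu(unsigned long group_cpus, unsigned long group_mask)
{
    return first_cpu(group_cpus & group_mask);
}

int main(void)
{
    /* group spans CPUs 2-5, but the iteration mask excludes CPU 2 */
    unsigned long cpus = 0x3c, mask = 0x38;

    printf("balance cpu = %d\n", balance_cpu(cpus, mask));    /* prints 3 */
    return 0;
}

Keeping one canonical balance CPU per group is what the init_sched_groups_power() hunk further down relies on when it bails out early on every other CPU.
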
@@ -6041,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6041 if (cpumask_test_cpu(i, covered)) 6064 if (cpumask_test_cpu(i, covered))
6042 continue; 6065 continue;
6043 6066
6067 child = *per_cpu_ptr(sdd->sd, i);
6068
6069 /* See the comment near build_group_mask(). */
6070 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6071 continue;
6072
6044 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6073 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6045 GFP_KERNEL, cpu_to_node(cpu)); 6074 GFP_KERNEL, cpu_to_node(cpu));
6046 6075
@@ -6048,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6048 goto fail; 6077 goto fail;
6049 6078
6050 sg_span = sched_group_cpus(sg); 6079 sg_span = sched_group_cpus(sg);
6051
6052 child = *per_cpu_ptr(sdd->sd, i);
6053 if (child->child) { 6080 if (child->child) {
6054 child = child->child; 6081 child = child->child;
6055 cpumask_copy(sg_span, sched_domain_span(child)); 6082 cpumask_copy(sg_span, sched_domain_span(child));
@@ -6058,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6058 6085
6059 cpumask_or(covered, covered, sg_span); 6086 cpumask_or(covered, covered, sg_span);
6060 6087
6061 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6088 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6062 atomic_inc(&sg->sgp->ref); 6089 if (atomic_inc_return(&sg->sgp->ref) == 1)
6090 build_group_mask(sd, sg);
6063 6091
6064 if (cpumask_test_cpu(cpu, sg_span)) 6092 /*
6093 * Initialize sgp->power such that even if we mess up the
6094 * domains and no possible iteration will get us here, we won't
6095 * die on a /0 trap.
6096 */
6097 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6098
6099 /*
6100 * Make sure the first group of this domain contains the
6101 * canonical balance cpu. Otherwise the sched_domain iteration
6102 * breaks. See update_sg_lb_stats().
6103 */
6104 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6105 group_balance_cpu(sg) == cpu)
6065 groups = sg; 6106 groups = sg;
6066 6107
6067 if (!first) 6108 if (!first)
@@ -6135,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6135 6176
6136 cpumask_clear(sched_group_cpus(sg)); 6177 cpumask_clear(sched_group_cpus(sg));
6137 sg->sgp->power = 0; 6178 sg->sgp->power = 0;
6179 cpumask_setall(sched_group_mask(sg));
6138 6180
6139 for_each_cpu(j, span) { 6181 for_each_cpu(j, span) {
6140 if (get_group(j, sdd, NULL) != group) 6182 if (get_group(j, sdd, NULL) != group)
@@ -6176,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6176 sg = sg->next; 6218 sg = sg->next;
6177 } while (sg != sd->groups); 6219 } while (sg != sd->groups);
6178 6220
6179 if (cpu != group_first_cpu(sg)) 6221 if (cpu != group_balance_cpu(sg))
6180 return; 6222 return;
6181 6223
6182 update_group_power(sd, cpu); 6224 update_group_power(sd, cpu);
@@ -6211,10 +6253,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6253}
6212 6254
6213SD_INIT_FUNC(CPU) 6255SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6256#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6257 SD_INIT_FUNC(SIBLING)
6220#endif 6258#endif
@@ -6230,11 +6268,8 @@ int sched_domain_level_max;
6230 6268
6231static int __init setup_relax_domain_level(char *str) 6269static int __init setup_relax_domain_level(char *str)
6232{ 6270{
6233 unsigned long val; 6271 if (kstrtoint(str, 0, &default_relax_domain_level))
6234 6272 pr_warn("Unable to set relax_domain_level\n");
6235 val = simple_strtoul(str, NULL, 0);
6236 if (val < sched_domain_level_max)
6237 default_relax_domain_level = val;
6238 6273
6239 return 1; 6274 return 1;
6240} 6275}
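
setup_relax_domain_level() switches from simple_strtoul(), which silently yields 0 on malformed input and cannot report failure, to kstrtoint(), which rejects bad strings so the parser can warn instead of quietly storing a bogus level (the old comparison against sched_domain_level_max goes away with it). A user-space analogue of that strict-parse-or-warn pattern, built on strtol() with full error checking (strict_strtoint() here is an illustrative helper, not a kernel API):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int strict_strtoint(const char *s, int base, int *res)
{
    char *end;
    long val;

    errno = 0;
    val = strtol(s, &end, base);
    if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
        return -EINVAL;    /* reject anything that is not entirely a number */
    *res = (int)val;
    return 0;
}

int main(void)
{
    int level;

    if (strict_strtoint("2", 0, &level))
        fprintf(stderr, "Unable to set relax_domain_level\n");
    else
        printf("relax_domain_level = %d\n", level);

    if (strict_strtoint("2x", 0, &level))    /* malformed: warn, keep old value */
        fprintf(stderr, "Unable to set relax_domain_level\n");
    return 0;
}
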
@@ -6336,15 +6371,236 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6371 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6372#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6373 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6374 { NULL, },
6344}; 6375};
6345 6376
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6377static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6378
6379#ifdef CONFIG_NUMA
6380
6381static int sched_domains_numa_levels;
6382static int *sched_domains_numa_distance;
6383static struct cpumask ***sched_domains_numa_masks;
6384static int sched_domains_curr_level;
6385
6386static inline int sd_local_flags(int level)
6387{
6388 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6389 return 0;
6390
6391 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6392}
6393
6394static struct sched_domain *
6395sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6396{
6397 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6398 int level = tl->numa_level;
6399 int sd_weight = cpumask_weight(
6400 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6401
6402 *sd = (struct sched_domain){
6403 .min_interval = sd_weight,
6404 .max_interval = 2*sd_weight,
6405 .busy_factor = 32,
6406 .imbalance_pct = 125,
6407 .cache_nice_tries = 2,
6408 .busy_idx = 3,
6409 .idle_idx = 2,
6410 .newidle_idx = 0,
6411 .wake_idx = 0,
6412 .forkexec_idx = 0,
6413
6414 .flags = 1*SD_LOAD_BALANCE
6415 | 1*SD_BALANCE_NEWIDLE
6416 | 0*SD_BALANCE_EXEC
6417 | 0*SD_BALANCE_FORK
6418 | 0*SD_BALANCE_WAKE
6419 | 0*SD_WAKE_AFFINE
6420 | 0*SD_PREFER_LOCAL
6421 | 0*SD_SHARE_CPUPOWER
6422 | 0*SD_SHARE_PKG_RESOURCES
6423 | 1*SD_SERIALIZE
6424 | 0*SD_PREFER_SIBLING
6425 | sd_local_flags(level)
6426 ,
6427 .last_balance = jiffies,
6428 .balance_interval = sd_weight,
6429 };
6430 SD_INIT_NAME(sd, NUMA);
6431 sd->private = &tl->data;
6432
6433 /*
6434 * Ugly hack to pass state to sd_numa_mask()...
6435 */
6436 sched_domains_curr_level = tl->numa_level;
6437
6438 return sd;
6439}
6440
6441static const struct cpumask *sd_numa_mask(int cpu)
6442{
6443 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6444}
6445
6446static void sched_numa_warn(const char *str)
6447{
6448 static int done = false;
6449 int i,j;
6450
6451 if (done)
6452 return;
6453
6454 done = true;
6455
6456 printk(KERN_WARNING "ERROR: %s\n\n", str);
6457
6458 for (i = 0; i < nr_node_ids; i++) {
6459 printk(KERN_WARNING " ");
6460 for (j = 0; j < nr_node_ids; j++)
6461 printk(KERN_CONT "%02d ", node_distance(i,j));
6462 printk(KERN_CONT "\n");
6463 }
6464 printk(KERN_WARNING "\n");
6465}
6466
6467static bool find_numa_distance(int distance)
6468{
6469 int i;
6470
6471 if (distance == node_distance(0, 0))
6472 return true;
6473
6474 for (i = 0; i < sched_domains_numa_levels; i++) {
6475 if (sched_domains_numa_distance[i] == distance)
6476 return true;
6477 }
6478
6479 return false;
6480}
6481
6482static void sched_init_numa(void)
6483{
6484 int next_distance, curr_distance = node_distance(0, 0);
6485 struct sched_domain_topology_level *tl;
6486 int level = 0;
6487 int i, j, k;
6488
6489 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6490 if (!sched_domains_numa_distance)
6491 return;
6492
6493 /*
6494 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6495 * unique distances in the node_distance() table.
6496 *
6497 * Assumes node_distance(0,j) includes all distances in
6498 * node_distance(i,j) in order to avoid cubic time.
6499 */
6500 next_distance = curr_distance;
6501 for (i = 0; i < nr_node_ids; i++) {
6502 for (j = 0; j < nr_node_ids; j++) {
6503 for (k = 0; k < nr_node_ids; k++) {
6504 int distance = node_distance(i, k);
6505
6506 if (distance > curr_distance &&
6507 (distance < next_distance ||
6508 next_distance == curr_distance))
6509 next_distance = distance;
6510
6511 /*
6512 * While not a strong assumption it would be nice to know
6513 * about cases where if node A is connected to B, B is not
6514 * equally connected to A.
6515 */
6516 if (sched_debug() && node_distance(k, i) != distance)
6517 sched_numa_warn("Node-distance not symmetric");
6518
6519 if (sched_debug() && i && !find_numa_distance(distance))
6520 sched_numa_warn("Node-0 not representative");
6521 }
6522 if (next_distance != curr_distance) {
6523 sched_domains_numa_distance[level++] = next_distance;
6524 sched_domains_numa_levels = level;
6525 curr_distance = next_distance;
6526 } else break;
6527 }
6528
6529 /*
6530 * In case of sched_debug() we verify the above assumption.
6531 */
6532 if (!sched_debug())
6533 break;
6534 }
6535 /*
6536 * 'level' contains the number of unique distances, excluding the
6537 * identity distance node_distance(i,i).
6538 *
6539 * The sched_domains_numa_distance[] array includes the actual distance
6540 * numbers.
6541 */
6542
6543 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6544 if (!sched_domains_numa_masks)
6545 return;
6546
6547 /*
6548 * Now for each level, construct a mask per node which contains all
6549 * cpus of nodes that are that many hops away from us.
6550 */
6551 for (i = 0; i < level; i++) {
6552 sched_domains_numa_masks[i] =
6553 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6554 if (!sched_domains_numa_masks[i])
6555 return;
6556
6557 for (j = 0; j < nr_node_ids; j++) {
6558 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6559 if (!mask)
6560 return;
6561
6562 sched_domains_numa_masks[i][j] = mask;
6563
6564 for (k = 0; k < nr_node_ids; k++) {
6565 if (node_distance(j, k) > sched_domains_numa_distance[i])
6566 continue;
6567
6568 cpumask_or(mask, mask, cpumask_of_node(k));
6569 }
6570 }
6571 }
6572
6573 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6574 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6575 if (!tl)
6576 return;
6577
6578 /*
6579 * Copy the default topology bits..
6580 */
6581 for (i = 0; default_topology[i].init; i++)
6582 tl[i] = default_topology[i];
6583
6584 /*
6585 * .. and append 'j' levels of NUMA goodness.
6586 */
6587 for (j = 0; j < level; i++, j++) {
6588 tl[i] = (struct sched_domain_topology_level){
6589 .init = sd_numa_init,
6590 .mask = sd_numa_mask,
6591 .flags = SDTL_OVERLAP,
6592 .numa_level = j,
6593 };
6594 }
6595
6596 sched_domain_topology = tl;
6597}
6598#else
6599static inline void sched_init_numa(void)
6600{
6601}
6602#endif /* CONFIG_NUMA */
6603
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6604static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6605{
6350 struct sched_domain_topology_level *tl; 6606 struct sched_domain_topology_level *tl;
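
The new sched_init_numa() replaces the old find_next_best_node()/sched_domain_node_span() machinery: it extracts the set of unique node distances (assuming row 0 of node_distance() already contains every distance that occurs, which the sched_debug() checks try to verify along with symmetry), builds one cpumask per node per distance level, and appends one SDTL_OVERLAP NUMA level per unique distance behind default_topology[]. A simplified user-space sketch of just the unique-distance extraction, run over a made-up 4-node distance table:

/* Illustrative sketch, not kernel code; the distance table is invented. */
#include <stdio.h>

#define NODES 4

static const int dist[NODES][NODES] = {
    { 10, 20, 20, 30 },
    { 20, 10, 20, 30 },
    { 20, 20, 10, 30 },
    { 30, 30, 30, 10 },
};

int main(void)
{
    int levels[NODES * NODES];
    int curr = dist[0][0], level = 0;

    for (;;) {
        int next = curr;

        /* find the smallest distance strictly larger than 'curr' */
        for (int i = 0; i < NODES; i++)
            for (int k = 0; k < NODES; k++) {
                int d = dist[i][k];

                if (d > curr && (d < next || next == curr))
                    next = d;
            }
        if (next == curr)
            break;
        levels[level++] = next;
        curr = next;
    }

    printf("%d NUMA level(s):", level);
    for (int i = 0; i < level; i++)
        printf(" %d", levels[i]);
    printf("\n");
    return 0;
}

For this table the sketch finds two levels (20 and 30), so two NUMA topology levels would be appended on top of the default ones.
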
@@ -6382,9 +6638,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6382 if (!sg) 6638 if (!sg)
6383 return -ENOMEM; 6639 return -ENOMEM;
6384 6640
6641 sg->next = sg;
6642
6385 *per_cpu_ptr(sdd->sg, j) = sg; 6643 *per_cpu_ptr(sdd->sg, j) = sg;
6386 6644
6387 sgp = kzalloc_node(sizeof(struct sched_group_power), 6645 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6388 GFP_KERNEL, cpu_to_node(j)); 6646 GFP_KERNEL, cpu_to_node(j));
6389 if (!sgp) 6647 if (!sgp)
6390 return -ENOMEM; 6648 return -ENOMEM;
@@ -6405,16 +6663,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
6405 struct sd_data *sdd = &tl->data; 6663 struct sd_data *sdd = &tl->data;
6406 6664
6407 for_each_cpu(j, cpu_map) { 6665 for_each_cpu(j, cpu_map) {
6408 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6666 struct sched_domain *sd;
6409 if (sd && (sd->flags & SD_OVERLAP)) 6667
6410 free_sched_groups(sd->groups, 0); 6668 if (sdd->sd) {
6411 kfree(*per_cpu_ptr(sdd->sd, j)); 6669 sd = *per_cpu_ptr(sdd->sd, j);
6412 kfree(*per_cpu_ptr(sdd->sg, j)); 6670 if (sd && (sd->flags & SD_OVERLAP))
6413 kfree(*per_cpu_ptr(sdd->sgp, j)); 6671 free_sched_groups(sd->groups, 0);
6672 kfree(*per_cpu_ptr(sdd->sd, j));
6673 }
6674
6675 if (sdd->sg)
6676 kfree(*per_cpu_ptr(sdd->sg, j));
6677 if (sdd->sgp)
6678 kfree(*per_cpu_ptr(sdd->sgp, j));
6414 } 6679 }
6415 free_percpu(sdd->sd); 6680 free_percpu(sdd->sd);
6681 sdd->sd = NULL;
6416 free_percpu(sdd->sg); 6682 free_percpu(sdd->sg);
6683 sdd->sg = NULL;
6417 free_percpu(sdd->sgp); 6684 free_percpu(sdd->sgp);
6685 sdd->sgp = NULL;
6418 } 6686 }
6419} 6687}
6420 6688
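
__sdt_free() now tolerates a partially completed __sdt_alloc(): each per-cpu table is checked before it is dereferenced and the percpu pointers are cleared afterwards, so the allocation error path can run the teardown on a half-built sd_data and a repeated call stays harmless. A generic user-space sketch of that defensive free-and-NULL pattern (struct sd_data_like is a stand-in, not the kernel structure):

#include <stdlib.h>
#include <string.h>

struct sd_data_like {
    void **sd;    /* NULL until allocated */
    void **sg;
    void **sgp;
};

static void free_table(void ***table, int n)
{
    int i;

    if (!*table)        /* allocation may have failed part-way */
        return;
    for (i = 0; i < n; i++)
        free((*table)[i]);
    free(*table);
    *table = NULL;      /* make a second teardown a no-op */
}

static void sdt_free(struct sd_data_like *d, int n)
{
    free_table(&d->sd, n);
    free_table(&d->sg, n);
    free_table(&d->sgp, n);
}

int main(void)
{
    struct sd_data_like d;

    memset(&d, 0, sizeof(d));
    d.sd = calloc(4, sizeof(void *));    /* pretend the ->sg allocation failed */

    sdt_free(&d, 4);    /* safe on the partial structure */
    sdt_free(&d, 4);    /* and safe to call again */
    return 0;
}
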
@@ -6427,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6427 if (!sd) 6695 if (!sd)
6428 return child; 6696 return child;
6429 6697
6430 set_domain_attribute(sd, attr);
6431 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6698 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6432 if (child) { 6699 if (child) {
6433 sd->level = child->level + 1; 6700 sd->level = child->level + 1;
@@ -6435,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6435 child->parent = sd; 6702 child->parent = sd;
6436 } 6703 }
6437 sd->child = child; 6704 sd->child = child;
6705 set_domain_attribute(sd, attr);
6438 6706
6439 return sd; 6707 return sd;
6440} 6708}
@@ -6575,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6575 if (!doms_cur) 6843 if (!doms_cur)
6576 doms_cur = &fallback_doms; 6844 doms_cur = &fallback_doms;
6577 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6845 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6578 dattr_cur = NULL;
6579 err = build_sched_domains(doms_cur[0], NULL); 6846 err = build_sched_domains(doms_cur[0], NULL);
6580 register_sched_domain_sysctl(); 6847 register_sched_domain_sysctl();
6581 6848
@@ -6700,97 +6967,6 @@ match2:
6700 mutex_unlock(&sched_domains_mutex); 6967 mutex_unlock(&sched_domains_mutex);
6701} 6968}
6702 6969
6703#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6704static void reinit_sched_domains(void)
6705{
6706 get_online_cpus();
6707
6708 /* Destroy domains first to force the rebuild */
6709 partition_sched_domains(0, NULL, NULL);
6710
6711 rebuild_sched_domains();
6712 put_online_cpus();
6713}
6714
6715static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6716{
6717 unsigned int level = 0;
6718
6719 if (sscanf(buf, "%u", &level) != 1)
6720 return -EINVAL;
6721
6722 /*
6723 * level is always be positive so don't check for
6724 * level < POWERSAVINGS_BALANCE_NONE which is 0
6725 * What happens on 0 or 1 byte write,
6726 * need to check for count as well?
6727 */
6728
6729 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6730 return -EINVAL;
6731
6732 if (smt)
6733 sched_smt_power_savings = level;
6734 else
6735 sched_mc_power_savings = level;
6736
6737 reinit_sched_domains();
6738
6739 return count;
6740}
6741
6742#ifdef CONFIG_SCHED_MC
6743static ssize_t sched_mc_power_savings_show(struct device *dev,
6744 struct device_attribute *attr,
6745 char *buf)
6746{
6747 return sprintf(buf, "%u\n", sched_mc_power_savings);
6748}
6749static ssize_t sched_mc_power_savings_store(struct device *dev,
6750 struct device_attribute *attr,
6751 const char *buf, size_t count)
6752{
6753 return sched_power_savings_store(buf, count, 0);
6754}
6755static DEVICE_ATTR(sched_mc_power_savings, 0644,
6756 sched_mc_power_savings_show,
6757 sched_mc_power_savings_store);
6758#endif
6759
6760#ifdef CONFIG_SCHED_SMT
6761static ssize_t sched_smt_power_savings_show(struct device *dev,
6762 struct device_attribute *attr,
6763 char *buf)
6764{
6765 return sprintf(buf, "%u\n", sched_smt_power_savings);
6766}
6767static ssize_t sched_smt_power_savings_store(struct device *dev,
6768 struct device_attribute *attr,
6769 const char *buf, size_t count)
6770{
6771 return sched_power_savings_store(buf, count, 1);
6772}
6773static DEVICE_ATTR(sched_smt_power_savings, 0644,
6774 sched_smt_power_savings_show,
6775 sched_smt_power_savings_store);
6776#endif
6777
6778int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6779{
6780 int err = 0;
6781
6782#ifdef CONFIG_SCHED_SMT
6783 if (smt_capable())
6784 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6785#endif
6786#ifdef CONFIG_SCHED_MC
6787 if (!err && mc_capable())
6788 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6789#endif
6790 return err;
6791}
6792#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6793
6794/* 6970/*
6795 * Update cpusets according to cpu_active mask. If cpusets are 6971 * Update cpusets according to cpu_active mask. If cpusets are
6796 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6972 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6828,6 +7004,8 @@ void __init sched_init_smp(void)
6828 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7004 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6829 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7005 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6830 7006
7007 sched_init_numa();
7008
6831 get_online_cpus(); 7009 get_online_cpus();
6832 mutex_lock(&sched_domains_mutex); 7010 mutex_lock(&sched_domains_mutex);
6833 init_sched_domains(cpu_active_mask); 7011 init_sched_domains(cpu_active_mask);
@@ -7049,6 +7227,7 @@ void __init sched_init(void)
7049 /* May be allocated at isolcpus cmdline parse time */ 7227 /* May be allocated at isolcpus cmdline parse time */
7050 if (cpu_isolated_map == NULL) 7228 if (cpu_isolated_map == NULL)
7051 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7229 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7230 idle_thread_set_boot_cpu();
7052#endif 7231#endif
7053 init_sched_fair_class(); 7232 init_sched_fair_class();
7054 7233
@@ -7970,13 +8149,9 @@ static struct cftype cpu_files[] = {
7970 .write_u64 = cpu_rt_period_write_uint, 8149 .write_u64 = cpu_rt_period_write_uint,
7971 }, 8150 },
7972#endif 8151#endif
8152 { } /* terminate */
7973}; 8153};
7974 8154
7975static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7976{
7977 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7978}
7979
7980struct cgroup_subsys cpu_cgroup_subsys = { 8155struct cgroup_subsys cpu_cgroup_subsys = {
7981 .name = "cpu", 8156 .name = "cpu",
7982 .create = cpu_cgroup_create, 8157 .create = cpu_cgroup_create,
@@ -7984,8 +8159,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7984 .can_attach = cpu_cgroup_can_attach, 8159 .can_attach = cpu_cgroup_can_attach,
7985 .attach = cpu_cgroup_attach, 8160 .attach = cpu_cgroup_attach,
7986 .exit = cpu_cgroup_exit, 8161 .exit = cpu_cgroup_exit,
7987 .populate = cpu_cgroup_populate,
7988 .subsys_id = cpu_cgroup_subsys_id, 8162 .subsys_id = cpu_cgroup_subsys_id,
8163 .base_cftypes = cpu_files,
7989 .early_init = 1, 8164 .early_init = 1,
7990}; 8165};
7991 8166
@@ -8170,13 +8345,9 @@ static struct cftype files[] = {
8170 .name = "stat", 8345 .name = "stat",
8171 .read_map = cpuacct_stats_show, 8346 .read_map = cpuacct_stats_show,
8172 }, 8347 },
8348 { } /* terminate */
8173}; 8349};
8174 8350
8175static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8176{
8177 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8178}
8179
8180/* 8351/*
8181 * charge this task's execution time to its accounting group. 8352 * charge this task's execution time to its accounting group.
8182 * 8353 *
@@ -8208,7 +8379,7 @@ struct cgroup_subsys cpuacct_subsys = {
8208 .name = "cpuacct", 8379 .name = "cpuacct",
8209 .create = cpuacct_create, 8380 .create = cpuacct_create,
8210 .destroy = cpuacct_destroy, 8381 .destroy = cpuacct_destroy,
8211 .populate = cpuacct_populate,
8212 .subsys_id = cpuacct_subsys_id, 8382 .subsys_id = cpuacct_subsys_id,
8383 .base_cftypes = files,
8213}; 8384};
8214#endif /* CONFIG_CGROUP_CPUACCT */ 8385#endif /* CONFIG_CGROUP_CPUACCT */