Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  420
1 file changed, 218 insertions(+), 202 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2e2e173d8f7..d833cc94eedc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
 	int prio = p->static_prio - MAX_RT_PRIO;
@@ -2481,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
 {
-	unsigned long this_load = this_rq->load.weight;
-	unsigned long curr_jiffies = jiffies;
-	unsigned long pending_updates;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
 
-	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
 	/* Update our load: */
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
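Illustration only, not part of the patch: __update_cpu_load() now takes the number of jiffies elapsed since the previous update, so the per-index averages can be decayed for the ticks missed while the CPU was idle. A minimal user-space sketch of that idea, with decay_missed() as a toy stand-in for the decay_load_missed() helper named in the hunk header above (the kernel version uses precomputed degrade factors rather than a loop):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/*
 * Decay 'load' as if 'missed' updates of the index-'idx' average had been
 * applied with a zero instantaneous load: each missed tick multiplies the
 * average by (scale - 1) / scale, i.e. subtracts load >> idx.
 */
static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	while (missed--)
		load -= load >> idx;
	return load;
}

/* One update that accounts for 'pending' jiffies since the previous one. */
static void update_load(unsigned long cpu_load[], unsigned long this_load,
			unsigned long pending)
{
	int i, scale;

	cpu_load[0] = this_load;	/* idx 0 tracks the instantaneous load */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old = decay_missed(cpu_load[i], pending - 1, i);

		/* exponentially weighted average, heavier history at higher idx */
		cpu_load[i] = (old * (scale - 1) + this_load) / scale;
	}
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0, 1024, 1024, 1024, 1024 };

	/* CPU was idle for 4 jiffies, then folds in a load of 0 once. */
	update_load(cpu_load, 0, 4);
	for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}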
@@ -2521,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)
 	sched_avg_update(this_rq);
 }
 
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = jiffies;
+	unsigned long load = this_rq->load.weight;
+	unsigned long pending_updates;
+
+	/*
+	 * Bloody broken means of dealing with nohz, but better than nothing..
+	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+	 * update and see 0 difference the one time and 2 the next, even though
+	 * we ticked at roughly the same rate.
+	 *
+	 * Hence we only use this from nohz_idle_balance() and skip this
+	 * nonsense when called from the scheduler_tick() since that's
+	 * guaranteed a stable rate.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-	update_cpu_load(this_rq);
+	/*
+	 * See the mess in update_idle_cpu_load().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, this_rq->load.weight, 1);
 
 	calc_load_account_active(this_rq);
 }
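Illustration only, not kernel code: the drift the comment above describes can be shown with made-up numbers. Both CPUs tick once per jiffy, but if the jiffies update wanders back and forth by roughly half a tick relative to this CPU's tick, the per-tick jiffies delta it observes jitters between 0 and 2 even though the average rate matches:

#include <stdio.h>

int main(void)
{
	unsigned long last = 1;

	for (int n = 2; n <= 9; n++) {
		/* phase of the jiffies update relative to our nth tick */
		double phase = (n % 2) ? 0.6 : -0.6;
		unsigned long now = (unsigned long)(n + phase);

		printf("tick %d: observed jiffies delta = %lu\n", n, now - last);
		last = now;
	}
	return 0;	/* deltas alternate 0, 2, 0, 2, ... */
}

This is why the patch computes pending_updates only on the nohz path, while the tick path simply stamps last_load_update_tick and passes 1.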
@@ -3108,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 	dump_stack();
+	add_taint(TAINT_WARN);
 }
 
 /*
@@ -5555,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		break;
 	}
 
-	if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+	if (!(sd->flags & SD_OVERLAP) &&
+	    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 		printk(KERN_CONT "\n");
 		printk(KERN_ERR "ERROR: repeated CPUs\n");
 		break;
@@ -5893,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
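For context only (an illustration, not part of the patch): the removed find_next_best_node()/sched_domain_node_span() pair grew a node's span greedily by repeatedly adding the closest not-yet-used node, up to SD_NODES_PER_DOMAIN nodes. A standalone sketch of that selection over a made-up 4-node distance table; the nodemask/cpumask types and the "skip CPU-less nodes" check are dropped for brevity:

#include <limits.h>
#include <stdio.h>

#define NR_NODES		4
#define SD_NODES_PER_DOMAIN	4	/* demo value */

/* Made-up SLIT-style table: 10 = local, larger = further away. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Greedy pick of the closest node to 'node' that is not yet used. */
static int find_next_best_node(int node, int used[NR_NODES])
{
	int best = -1, min_val = INT_MAX;

	for (int i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;	/* start the scan at @node */

		if (used[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best = n;
		}
	}
	if (best != -1)
		used[best] = 1;
	return best;
}

int main(void)
{
	int node = 0;
	int used[NR_NODES] = { 0 };

	used[node] = 1;
	printf("span for node %d: %d", node, node);
	for (int i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next = find_next_best_node(node, used);
		if (next < 0)
			break;
		printf(" %d", next);
	}
	printf("\n");	/* prints: span for node 0: 0 1 2 3, closest first */
	return 0;
}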
@@ -6015,6 +5954,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int flags;
+	int numa_level;
 	struct sd_data data;
 };
 
@@ -6206,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6331,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(j, k) > sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
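Illustration only, not part of the patch: on a hypothetical 4-node machine with the distance table below, the deduplication pass in sched_init_numa() would find the unique non-local distances {20, 40}, i.e. two NUMA levels, and node 0's per-level mask would grow from {0, 1} at level 0 to {0, 1, 2, 3} at level 1. A standalone sketch of those two passes, printing node ids where the kernel builds cpumasks:

#include <stdio.h>

#define NR_NODES 4

/* Made-up SLIT-style table: 10 is the local distance, as node_distance(0, 0)
 * is above. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

int main(void)
{
	int uniq[NR_NODES];
	int curr = distance[0][0], next;
	int level = 0;

	/*
	 * Pass 1: deduplicating selection "sort" over row 0, relying on the
	 * same assumption as the kernel code: node_distance(0, j) contains
	 * every distance that occurs anywhere in the table.
	 */
	for (;;) {
		next = curr;
		for (int j = 0; j < NR_NODES; j++) {
			int d = distance[0][j];
			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;
		uniq[level++] = next;
		curr = next;
	}
	printf("%d NUMA level(s):", level);
	for (int i = 0; i < level; i++)
		printf(" %d", uniq[i]);
	printf("\n");		/* prints: 2 NUMA level(s): 20 40 */

	/*
	 * Pass 2: per level, a node's mask covers every node whose distance
	 * is within that level's threshold.
	 */
	for (int i = 0; i < level; i++) {
		for (int j = 0; j < NR_NODES; j++) {
			printf("level %d, node %d:", i, j);
			for (int k = 0; k < NR_NODES; k++)
				if (distance[j][k] <= uniq[i])
					printf(" %d", k);
			printf("\n");
		}
	}
	return 0;
}

Each appended topology level then becomes an SDTL_OVERLAP sched_domain level whose span is that node's mask for the level, which is why the debug check earlier in this diff has to tolerate repeated CPUs when SD_OVERLAP is set.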
@@ -6707,97 +6812,6 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-	get_online_cpus();
-
-	/* Destroy domains first to force the rebuild */
-	partition_sched_domains(0, NULL, NULL);
-
-	rebuild_sched_domains();
-	put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-	unsigned int level = 0;
-
-	if (sscanf(buf, "%u", &level) != 1)
-		return -EINVAL;
-
-	/*
-	 * level is always be positive so don't check for
-	 * level < POWERSAVINGS_BALANCE_NONE which is 0
-	 * What happens on 0 or 1 byte write,
-	 * need to check for count as well?
-	 */
-
-	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-		return -EINVAL;
-
-	if (smt)
-		sched_smt_power_savings = level;
-	else
-		sched_mc_power_savings = level;
-
-	reinit_sched_domains();
-
-	return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-		   sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-					    struct device_attribute *attr,
-					    char *buf)
-{
-	return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-					     struct device_attribute *attr,
-					     const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-		   sched_smt_power_savings_show,
-		   sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-	int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-	if (smt_capable())
-		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-	if (!err && mc_capable())
-		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-	return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask. If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6835,6 +6849,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);