Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 327
 1 file changed, 218 insertions(+), 109 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..bd314d7cd9f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif

-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
         int prio = p->static_prio - MAX_RT_PRIO;
@@ -2486,22 +2484,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                              unsigned long pending_updates)
 {
-        unsigned long this_load = this_rq->load.weight;
-        unsigned long curr_jiffies = jiffies;
-        unsigned long pending_updates;
         int i, scale;

         this_rq->nr_load_updates++;

-        /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-        if (curr_jiffies == this_rq->last_load_update_tick)
-                return;
-
-        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-        this_rq->last_load_update_tick = curr_jiffies;
-
         /* Update our load: */
         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2515,45 @@ void update_cpu_load(struct rq *this_rq)
         sched_avg_update(this_rq);
 }

+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+        unsigned long curr_jiffies = jiffies;
+        unsigned long load = this_rq->load.weight;
+        unsigned long pending_updates;
+
+        /*
+         * Bloody broken means of dealing with nohz, but better than nothing..
+         * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+         * update and see 0 difference the one time and 2 the next, even though
+         * we ticked at roughly the same rate.
+         *
+         * Hence we only use this from nohz_idle_balance() and skip this
+         * nonsense when called from the scheduler_tick() since that's
+         * guaranteed a stable rate.
+         */
+        if (load || curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        this_rq->last_load_update_tick = curr_jiffies;
+
+        __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-        update_cpu_load(this_rq);
+        /*
+         * See the mess in update_idle_cpu_load().
+         */
+        this_rq->last_load_update_tick = jiffies;
+        __update_cpu_load(this_rq, this_rq->load.weight, 1);

         calc_load_account_active(this_rq);
 }
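
For orientation, the hunk above splits the old update_cpu_load() in two: __update_cpu_load() does the per-index averaging for a given number of missed ticks, update_idle_cpu_load() computes pending_updates from the jiffy delta when coming out of nohz idle, and update_cpu_load_active() always passes pending_updates == 1 from the regular tick. The user-space sketch below only illustrates that idea; the names (toy_update_load, NR_IDX) are invented, and the plain per-tick decay loop stands in for the kernel's precomputed decay_load_missed() tables.

/*
 * Toy model, illustrative only: bring a per-index load average up to
 * date after "pending_updates" ticks (pending_updates must be >= 1).
 */
#include <stdio.h>

#define NR_IDX 5

static unsigned long toy_cpu_load[NR_IDX];

static void toy_update_load(unsigned long this_load,
                            unsigned long pending_updates)
{
        int i, scale;

        toy_cpu_load[0] = this_load;    /* idx 0 tracks the instantaneous load */
        for (i = 1, scale = 2; i < NR_IDX; i++, scale += scale) {
                unsigned long old = toy_cpu_load[i], j;

                /* decay the old average once per missed tick ... */
                for (j = 0; j < pending_updates - 1; j++)
                        old = old * (scale - 1) / scale;

                /* ... then blend in the current load for this tick */
                toy_cpu_load[i] = (old * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        toy_update_load(1024, 1);       /* regular tick: one pending update */
        toy_update_load(0, 4);          /* leaving nohz idle: 3 ticks were missed */
        for (int i = 0; i < NR_IDX; i++)
                printf("cpu_load[%d] = %lu\n", i, toy_cpu_load[i]);
        return 0;
}
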
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }

-                if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+                if (!(sd->flags & SD_OVERLAP) &&
+                    cpumask_intersects(groupmask, sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: repeated CPUs\n");
                         break;
@@ -5898,92 +5924,6 @@ static int __init isolated_cpu_setup(char *str)

 __setup("isolcpus=", isolated_cpu_setup);

-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-        int i, n, val, min_val, best_node = -1;
-
-        min_val = INT_MAX;
-
-        for (i = 0; i < nr_node_ids; i++) {
-                /* Start at @node */
-                n = (node + i) % nr_node_ids;
-
-                if (!nr_cpus_node(n))
-                        continue;
-
-                /* Skip already used nodes */
-                if (node_isset(n, *used_nodes))
-                        continue;
-
-                /* Simple min distance search */
-                val = node_distance(node, n);
-
-                if (val < min_val) {
-                        min_val = val;
-                        best_node = n;
-                }
-        }
-
-        if (best_node != -1)
-                node_set(best_node, *used_nodes);
-        return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-        nodemask_t used_nodes;
-        int i;
-
-        cpumask_clear(span);
-        nodes_clear(used_nodes);
-
-        cpumask_or(span, span, cpumask_of_node(node));
-        node_set(node, used_nodes);
-
-        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-                int next_node = find_next_best_node(node, &used_nodes);
-                if (next_node < 0)
-                        break;
-                cpumask_or(span, span, cpumask_of_node(next_node));
-        }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-        lockdep_assert_held(&sched_domains_mutex);
-
-        sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-        return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-        return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
         return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5960,7 @@ struct sched_domain_topology_level {
         sched_domain_init_f init;
         sched_domain_mask_f mask;
         int flags;
+        int numa_level;
         struct sd_data data;
 };

@@ -6211,10 +6152,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }

 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6336,15 +6273,185 @@ static struct sched_domain_topology_level default_topology[] = {
         { sd_init_BOOK, cpu_book_mask, },
 #endif
         { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-        { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-        { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
         { NULL, },
 };

 static struct sched_domain_topology_level *sched_domain_topology = default_topology;

+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+        if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+                return 0;
+
+        return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+        int level = tl->numa_level;
+        int sd_weight = cpumask_weight(
+                        sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+        *sd = (struct sched_domain){
+                .min_interval = sd_weight,
+                .max_interval = 2*sd_weight,
+                .busy_factor = 32,
+                .imbalance_pct = 125,
+                .cache_nice_tries = 2,
+                .busy_idx = 3,
+                .idle_idx = 2,
+                .newidle_idx = 0,
+                .wake_idx = 0,
+                .forkexec_idx = 0,
+
+                .flags = 1*SD_LOAD_BALANCE
+                       | 1*SD_BALANCE_NEWIDLE
+                       | 0*SD_BALANCE_EXEC
+                       | 0*SD_BALANCE_FORK
+                       | 0*SD_BALANCE_WAKE
+                       | 0*SD_WAKE_AFFINE
+                       | 0*SD_PREFER_LOCAL
+                       | 0*SD_SHARE_CPUPOWER
+                       | 0*SD_POWERSAVINGS_BALANCE
+                       | 0*SD_SHARE_PKG_RESOURCES
+                       | 1*SD_SERIALIZE
+                       | 0*SD_PREFER_SIBLING
+                       | sd_local_flags(level)
+                       ,
+                .last_balance = jiffies,
+                .balance_interval = sd_weight,
+        };
+        SD_INIT_NAME(sd, NUMA);
+        sd->private = &tl->data;
+
+        /*
+         * Ugly hack to pass state to sd_numa_mask()...
+         */
+        sched_domains_curr_level = tl->numa_level;
+
+        return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+        return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+        int next_distance, curr_distance = node_distance(0, 0);
+        struct sched_domain_topology_level *tl;
+        int level = 0;
+        int i, j, k;
+
+        sched_domains_numa_scale = curr_distance;
+        sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+        if (!sched_domains_numa_distance)
+                return;
+
+        /*
+         * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+         * unique distances in the node_distance() table.
+         *
+         * Assumes node_distance(0,j) includes all distances in
+         * node_distance(i,j) in order to avoid cubic time.
+         *
+         * XXX: could be optimized to O(n log n) by using sort()
+         */
+        next_distance = curr_distance;
+        for (i = 0; i < nr_node_ids; i++) {
+                for (j = 0; j < nr_node_ids; j++) {
+                        int distance = node_distance(0, j);
+                        if (distance > curr_distance &&
+                            (distance < next_distance ||
+                             next_distance == curr_distance))
+                                next_distance = distance;
+                }
+                if (next_distance != curr_distance) {
+                        sched_domains_numa_distance[level++] = next_distance;
+                        sched_domains_numa_levels = level;
+                        curr_distance = next_distance;
+                } else break;
+        }
+        /*
+         * 'level' contains the number of unique distances, excluding the
+         * identity distance node_distance(i,i).
+         *
+         * The sched_domains_numa_distance[] array includes the actual distance
+         * numbers.
+         */
+
+        sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+        if (!sched_domains_numa_masks)
+                return;
+
+        /*
+         * Now for each level, construct a mask per node which contains all
+         * cpus of nodes that are that many hops away from us.
+         */
+        for (i = 0; i < level; i++) {
+                sched_domains_numa_masks[i] =
+                        kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+                if (!sched_domains_numa_masks[i])
+                        return;
+
+                for (j = 0; j < nr_node_ids; j++) {
+                        struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                        if (!mask)
+                                return;
+
+                        sched_domains_numa_masks[i][j] = mask;
+
+                        for (k = 0; k < nr_node_ids; k++) {
+                                if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                        continue;
+
+                                cpumask_or(mask, mask, cpumask_of_node(k));
+                        }
+                }
+        }
+
+        tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                        sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+        if (!tl)
+                return;
+
+        /*
+         * Copy the default topology bits..
+         */
+        for (i = 0; default_topology[i].init; i++)
+                tl[i] = default_topology[i];
+
+        /*
+         * .. and append 'j' levels of NUMA goodness.
+         */
+        for (j = 0; j < level; i++, j++) {
+                tl[i] = (struct sched_domain_topology_level){
+                        .init = sd_numa_init,
+                        .mask = sd_numa_mask,
+                        .flags = SDTL_OVERLAP,
+                        .numa_level = j,
+                };
+        }
+
+        sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
         struct sched_domain_topology_level *tl;
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6455static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6456{
6350 struct sched_domain_topology_level *tl; 6457 struct sched_domain_topology_level *tl;
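
As a rough illustration of what the new sched_init_numa() above computes, the stand-alone program below runs the same deduplicating pass over row 0 of an invented 4-node distance table (the kernel reads the real values through node_distance()) and prints, per level, which nodes each node's mask would span. The table, NR_NODES, and the printing are made up for this sketch.

/*
 * Stand-alone sketch of the distance handling in sched_init_numa().
 * The 4-node table and the node_distance() stub are invented.
 */
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

static int node_distance(int i, int j)
{
        return dist[i][j];
}

int main(void)
{
        int unique[NR_NODES];
        int level = 0, curr = node_distance(0, 0), next;
        int i, j, k;

        /*
         * Same deduplicating selection sort as above: repeatedly pick the
         * smallest distance in row 0 that is larger than the previous one.
         */
        next = curr;
        for (i = 0; i < NR_NODES; i++) {
                for (j = 0; j < NR_NODES; j++) {
                        int d = node_distance(0, j);
                        if (d > curr && (d < next || next == curr))
                                next = d;
                }
                if (next == curr)
                        break;
                unique[level++] = next;
                curr = next;
        }

        /* One mask per (level, node): all nodes within that distance. */
        for (i = 0; i < level; i++) {
                printf("level %d (distance <= %d):\n", i, unique[i]);
                for (j = 0; j < NR_NODES; j++) {
                        printf("  node %d spans:", j);
                        for (k = 0; k < NR_NODES; k++)
                                if (node_distance(j, k) <= unique[i])
                                        printf(" %d", k);
                        printf("\n");
                }
        }
        return 0;
}
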
@@ -6840,6 +6947,8 @@ void __init sched_init_smp(void)
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

+        sched_init_numa();
+
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
         init_sched_domains(cpu_active_mask);