Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 327
1 file changed, 218 insertions(+), 109 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..bd314d7cd9f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
 	int prio = p->static_prio - MAX_RT_PRIO;
@@ -2486,22 +2484,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
 {
-	unsigned long this_load = this_rq->load.weight;
-	unsigned long curr_jiffies = jiffies;
-	unsigned long pending_updates;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
 
-	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
 	/* Update our load: */
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2515,45 @@ void update_cpu_load(struct rq *this_rq)
 	sched_avg_update(this_rq);
 }
 
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = jiffies;
+	unsigned long load = this_rq->load.weight;
+	unsigned long pending_updates;
+
+	/*
+	 * Bloody broken means of dealing with nohz, but better than nothing..
+	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+	 * update and see 0 difference the one time and 2 the next, even though
+	 * we ticked at roughtly the same rate.
+	 *
+	 * Hence we only use this from nohz_idle_balance() and skip this
+	 * nonsense when called from the scheduler_tick() since that's
+	 * guaranteed a stable rate.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-	update_cpu_load(this_rq);
+	/*
+	 * See the mess in update_idle_cpu_load().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, this_rq->load.weight, 1);
 
 	calc_load_account_active(this_rq);
 }
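The catch-up logic above is small enough to model outside the kernel. What follows is a minimal standalone sketch (mock struct and function names, not kernel code) of what update_idle_cpu_load() does with jiffies: skip the update while the runqueue is busy or within the same jiffy, otherwise turn the elapsed jiffies into a pending_updates count for the decay path.

/*
 * Minimal sketch, assuming a mocked runqueue; only the jiffies-delta
 * bookkeeping mirrors update_idle_cpu_load() above.
 */
#include <stdio.h>

struct mock_rq {
	unsigned long last_load_update_tick;
	unsigned long load_weight;
};

static void mock_idle_cpu_load(struct mock_rq *rq, unsigned long curr_jiffies)
{
	unsigned long pending_updates;

	/* Only catch up while idle, and only once per jiffy. */
	if (rq->load_weight || curr_jiffies == rq->last_load_update_tick)
		return;

	pending_updates = curr_jiffies - rq->last_load_update_tick;
	rq->last_load_update_tick = curr_jiffies;

	/* The kernel would now decay cpu_load[] for 'pending_updates' ticks. */
	printf("decaying cpu_load[] for %lu missed ticks\n", pending_updates);
}

int main(void)
{
	struct mock_rq rq = { .last_load_update_tick = 100, .load_weight = 0 };

	mock_idle_cpu_load(&rq, 105);	/* woke from nohz idle: 5 missed ticks */
	mock_idle_cpu_load(&rq, 105);	/* same jiffy again: no-op */
	return 0;
}

The tick path needs none of this: update_cpu_load_active() runs at a guaranteed rate, so it simply stamps last_load_update_tick and passes pending_updates = 1.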
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
@@ -5898,92 +5924,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5960,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
+	int		    numa_level;
 	struct sd_data      data;
 };
 
@@ -6211,10 +6152,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6336,15 +6273,185 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_POWERSAVINGS_BALANCE
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(j, k) > sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
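The distance-deduplication pass in sched_init_numa() can be illustrated in isolation. Below is a minimal standalone sketch (a mock 4-node distance table stands in for node_distance(); not kernel code): each pass picks the smallest distance strictly greater than the current one, so every unique distance seen from node 0 becomes one NUMA topology level.

/*
 * Minimal sketch of the unique-distance extraction, with a mock
 * distance table; mirrors only the deduplication loop above.
 */
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES];
	int level = 0;
	int curr = dist[0][0];		/* the local distance */
	int i, j;

	for (i = 0; i < NR_NODES; i++) {
		int next = curr;

		/* Smallest distance strictly greater than the current one. */
		for (j = 0; j < NR_NODES; j++) {
			int d = dist[0][j];
			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;			/* no larger distance left */
		levels[level++] = next;
		curr = next;
	}

	for (i = 0; i < level; i++)
		printf("NUMA level %d: distance %d\n", i, levels[i]);
	/* Prints two levels: distance 20 and distance 30. */
	return 0;
}

With the mock table this yields two levels, and the kernel code would then build one cpumask per node per level (all CPUs of nodes within that distance) and append one SDTL_OVERLAP topology level per entry.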
@@ -6840,6 +6947,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);