Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 468 |
1 files changed, 468 insertions, 0 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d5e..98461de1ab65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
36 | #include <linux/suspend.h> | 36 | #include <linux/suspend.h> |
37 | #include <linux/vmalloc.h> | ||
37 | #include <linux/blkdev.h> | 38 | #include <linux/blkdev.h> |
38 | #include <linux/delay.h> | 39 | #include <linux/delay.h> |
39 | #include <linux/smp.h> | 40 | #include <linux/smp.h> |
@@ -5082,7 +5083,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5082 | 5083 | ||
5083 | #define SD_NODES_PER_DOMAIN 16 | 5084 | #define SD_NODES_PER_DOMAIN 16 |
5084 | 5085 | ||
5086 | /* | ||
5087 | * Self-tuning task migration cost measurement between source and target CPUs. | ||
5088 | * | ||
5089 | * This is done by measuring the cost of manipulating buffers of varying | ||
5090 | * sizes. For a given buffer-size here are the steps that are taken: | ||
5091 | * | ||
5092 | * 1) the source CPU reads+dirties a shared buffer | ||
5093 | * 2) the target CPU reads+dirties the same shared buffer | ||
5094 | * | ||
5095 | * We measure how long they take, in the following 4 scenarios: | ||
5096 | * | ||
5097 | * - source: CPU1, target: CPU2 | cost1 | ||
5098 | * - source: CPU2, target: CPU1 | cost2 | ||
5099 | * - source: CPU1, target: CPU1 | cost3 | ||
5100 | * - source: CPU2, target: CPU2 | cost4 | ||
5101 | * | ||
5102 | * We then calculate the cost1+cost2-cost3-cost4 difference - this is | ||
5103 | * the cost of migration. | ||
5104 | * | ||
5105 | * We then start off from a small buffer-size and iterate up to larger | ||
5106 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | ||
5107 | * doing a maximum search for the cost. (The maximum cost for a migration | ||
5108 | * normally occurs when the working set size is around the effective cache | ||
5109 | * size.) | ||
5110 | */ | ||
5111 | #define SEARCH_SCOPE 2 | ||
5112 | #define MIN_CACHE_SIZE (64*1024U) | ||
5113 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | ||
5114 | #define ITERATIONS 2 | ||
5115 | #define SIZE_THRESH 130 | ||
5116 | #define COST_THRESH 130 | ||
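To make the four-scenario arithmetic concrete, here is a minimal user-space sketch (not part of the patch, with made-up timings) that derives the migration cost the same way the code below does - cross-CPU runs minus same-CPU runs:

#include <stdio.h>

int main(void)
{
	/* hypothetical per-buffer-size timings, in nanoseconds */
	long long cost1 = 900000;	/* source CPU1, target CPU2 */
	long long cost2 = 920000;	/* source CPU2, target CPU1 */
	long long cost3 = 400000;	/* source CPU1, target CPU1 */
	long long cost4 = 410000;	/* source CPU2, target CPU2 */

	/* cross-CPU cost minus cached (same-CPU) cost */
	long long migration_cost = (cost1 + cost2) - (cost3 + cost4);

	printf("migration cost for this buffer size: %lld nsec\n",
	       migration_cost / 2);	/* averaged over the two directions */
	return 0;
}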
5117 | |||
5118 | /* | ||
5119 | * The migration cost is a function of 'domain distance'. Domain | ||
5120 | * distance is the number of steps a CPU has to iterate down its | ||
5121 | * domain tree to share a domain with the other CPU. The farther | ||
5122 | * two CPUs are from each other, the larger the distance gets. | ||
5123 | * | ||
5124 | * Note that we use the distance only to cache measurement results, | ||
5125 | * the distance value is not used numerically otherwise. When two | ||
5126 | * CPUs have the same distance it is assumed that the migration | ||
5127 | * cost is the same. (this is a simplification but quite practical) | ||
5128 | */ | ||
5129 | #define MAX_DOMAIN_DISTANCE 32 | ||
5130 | |||
5131 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5132 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; | ||
5133 | |||
5134 | /* | ||
5135 | * Allow override of migration cost - in units of microseconds. | ||
5136 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5137 | * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: | ||
5138 | */ | ||
5139 | static int __init migration_cost_setup(char *str) | ||
5140 | { | ||
5141 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5142 | |||
5143 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5144 | |||
5145 | printk("#ints: %d\n", ints[0]); | ||
5146 | for (i = 1; i <= ints[0]; i++) { | ||
5147 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5148 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5149 | } | ||
5150 | return 1; | ||
5151 | } | ||
5152 | |||
5153 | __setup ("migration_cost=", migration_cost_setup); | ||
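For illustration: booting with migration_cost=1000,2000,3000 makes get_options() fill ints[] as {3, 1000, 2000, 3000}, and the loop above stores 1000000, 2000000 and 3000000 nsec for distances 0..2. A rough user-space stand-in for that conversion (hypothetical values, not the kernel parser):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char arg[] = "1000,2000,3000";	/* hypothetical migration_cost= value, in usecs */
	unsigned long long migration_cost[32];
	int n = 0;

	for (char *tok = strtok(arg, ","); tok && n < 32; tok = strtok(NULL, ","))
		migration_cost[n++] = strtoull(tok, NULL, 10) * 1000;	/* usec -> nsec */

	for (int i = 0; i < n; i++)
		printf("migration_cost[%d]: %llu nsec\n", i, migration_cost[i]);
	return 0;
}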
5154 | |||
5155 | /* | ||
5156 | * Global multiplier (divisor) for migration-cutoff values, | ||
5157 | * in percent. E.g. use a value of 150 to get 1.5 times | ||
5158 | * longer cache-hot cutoff times. | ||
5159 | * | ||
5160 | * (We scale it from 100 to 128 to make long long handling easier.) | ||
5161 | */ | ||
5162 | |||
5163 | #define MIGRATION_FACTOR_SCALE 128 | ||
5164 | |||
5165 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5166 | |||
5167 | static int __init setup_migration_factor(char *str) | ||
5168 | { | ||
5169 | get_option(&str, &migration_factor); | ||
5170 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5171 | return 1; | ||
5172 | } | ||
5173 | |||
5174 | __setup("migration_factor=", setup_migration_factor); | ||
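A quick worked example of the factor scaling (hypothetical numbers): migration_factor=150 is stored as 150*128/100 = 192, so a cutoff later computed as cost*factor/128 comes out 1.5 times longer. In code form:

#include <stdio.h>

#define MIGRATION_FACTOR_SCALE 128

int main(void)
{
	unsigned int migration_factor = 150 * MIGRATION_FACTOR_SCALE / 100;	/* = 192 */
	unsigned long long max_cost = 1000000;	/* hypothetical measured cost, nsec */

	/* same shape as the final cache-hot cutoff: 2 * max_cost * factor / 128 */
	printf("cutoff: %llu nsec\n",
	       2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE);
	return 0;
}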
5175 | |||
5176 | /* | ||
5177 | * Estimated distance of two CPUs, measured via the number of domains | ||
5178 | * we have to pass for the two CPUs to be in the same span: | ||
5179 | */ | ||
5180 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5181 | { | ||
5182 | unsigned long distance = 0; | ||
5183 | struct sched_domain *sd; | ||
5184 | |||
5185 | for_each_domain(cpu1, sd) { | ||
5186 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5187 | if (cpu_isset(cpu2, sd->span)) | ||
5188 | return distance; | ||
5189 | distance++; | ||
5190 | } | ||
5191 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5192 | WARN_ON(1); | ||
5193 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5194 | } | ||
5195 | |||
5196 | return distance; | ||
5197 | } | ||
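As a toy example of what domain_distance() computes, consider a hypothetical 8-CPU box where CPU 0's domains span {0,1} (core), {0..3} (socket) and {0..7} (whole machine). The sketch below walks such a span table the same way the loop above walks the domain tree:

#include <stdio.h>

/* hypothetical span of each of CPU 0's domain levels, as a CPU bitmask */
static const unsigned int span[] = { 0x03, 0x0f, 0xff };

static unsigned long toy_domain_distance(int cpu2)
{
	unsigned long distance = 0;
	int level;

	for (level = 0; level < 3; level++) {
		if (span[level] & (1u << cpu2))
			return distance;
		distance++;
	}
	return distance;
}

int main(void)
{
	printf("distance(0,1) = %lu\n", toy_domain_distance(1));	/* same core:    0 */
	printf("distance(0,2) = %lu\n", toy_domain_distance(2));	/* same socket:  1 */
	printf("distance(0,7) = %lu\n", toy_domain_distance(7));	/* other socket: 2 */
	return 0;
}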
5198 | |||
5199 | static unsigned int migration_debug; | ||
5200 | |||
5201 | static int __init setup_migration_debug(char *str) | ||
5202 | { | ||
5203 | get_option(&str, &migration_debug); | ||
5204 | return 1; | ||
5205 | } | ||
5206 | |||
5207 | __setup("migration_debug=", setup_migration_debug); | ||
5208 | |||
5209 | /* | ||
5210 | * Maximum cache-size that the scheduler should try to measure. | ||
5211 | * Architectures with larger caches should tune this up during | ||
5212 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5213 | * bootup). | ||
5214 | */ | ||
5215 | unsigned int max_cache_size; | ||
5216 | |||
5217 | static int __init setup_max_cache_size(char *str) | ||
5218 | { | ||
5219 | get_option(&str, &max_cache_size); | ||
5220 | return 1; | ||
5221 | } | ||
5222 | |||
5223 | __setup("max_cache_size=", setup_max_cache_size); | ||
5224 | |||
5225 | /* | ||
5226 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5227 | * is the operation that is timed, so we try to generate unpredictable | ||
5228 | * cachemisses that still end up filling the L2 cache: | ||
5229 | */ | ||
5230 | static void touch_cache(void *__cache, unsigned long __size) | ||
5231 | { | ||
5232 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | ||
5233 | chunk2 = 2*size/3; | ||
5234 | unsigned long *cache = __cache; | ||
5235 | int i; | ||
5236 | |||
5237 | for (i = 0; i < size/6; i += 8) { | ||
5238 | switch (i % 6) { | ||
5239 | case 0: cache[i]++; | ||
5240 | case 1: cache[size-1-i]++; | ||
5241 | case 2: cache[chunk1-i]++; | ||
5242 | case 3: cache[chunk1+i]++; | ||
5243 | case 4: cache[chunk2-i]++; | ||
5244 | case 5: cache[chunk2+i]++; | ||
5245 | } | ||
5246 | } | ||
5247 | } | ||
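The same access pattern can be reproduced and timed in user space, which is a handy way to see the working-set-vs-cache-size knee the search below is hunting for. A minimal sketch (the 4MB size is arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* user-space copy of the access pattern (switch fall-through kept as above) */
static void touch_cache(void *__cache, unsigned long __size)
{
	unsigned long size = __size/sizeof(long), chunk1 = size/3,
		      chunk2 = 2*size/3, i;
	unsigned long *cache = __cache;

	for (i = 0; i < size/6; i += 8) {
		switch (i % 6) {
			case 0: cache[i]++;
			case 1: cache[size-1-i]++;
			case 2: cache[chunk1-i]++;
			case 3: cache[chunk1+i]++;
			case 4: cache[chunk2-i]++;
			case 5: cache[chunk2+i]++;
		}
	}
}

int main(void)
{
	unsigned long size = 4*1024*1024;	/* arbitrary working-set size */
	void *buf = calloc(1, size);
	struct timespec t0, t1;

	if (!buf)
		return 1;
	clock_gettime(CLOCK_MONOTONIC, &t0);
	touch_cache(buf, size);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("touch_cache(%lu): %ld nsec\n", size,
	       (t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec));
	free(buf);
	return 0;
}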
5248 | |||
5249 | /* | ||
5250 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5251 | */ | ||
5252 | static unsigned long long measure_one(void *cache, unsigned long size, | ||
5253 | int source, int target) | ||
5254 | { | ||
5255 | cpumask_t mask, saved_mask; | ||
5256 | unsigned long long t0, t1, t2, t3, cost; | ||
5257 | |||
5258 | saved_mask = current->cpus_allowed; | ||
5259 | |||
5260 | /* | ||
5261 | * Flush source caches to RAM and invalidate them: | ||
5262 | */ | ||
5263 | sched_cacheflush(); | ||
5264 | |||
5265 | /* | ||
5266 | * Migrate to the source CPU: | ||
5267 | */ | ||
5268 | mask = cpumask_of_cpu(source); | ||
5269 | set_cpus_allowed(current, mask); | ||
5270 | WARN_ON(smp_processor_id() != source); | ||
5271 | |||
5272 | /* | ||
5273 | * Dirty the working set: | ||
5274 | */ | ||
5275 | t0 = sched_clock(); | ||
5276 | touch_cache(cache, size); | ||
5277 | t1 = sched_clock(); | ||
5278 | |||
5279 | /* | ||
5280 | * Migrate to the target CPU, dirty the L2 cache and access | ||
5281 | * the shared buffer (which represents the working set | ||
5282 | * of a migrated task): | ||
5283 | */ | ||
5284 | mask = cpumask_of_cpu(target); | ||
5285 | set_cpus_allowed(current, mask); | ||
5286 | WARN_ON(smp_processor_id() != target); | ||
5287 | |||
5288 | t2 = sched_clock(); | ||
5289 | touch_cache(cache, size); | ||
5290 | t3 = sched_clock(); | ||
5291 | |||
5292 | cost = t1-t0 + t3-t2; | ||
5293 | |||
5294 | if (migration_debug >= 2) | ||
5295 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
5296 | source, target, t1-t0, t2-t1, t3-t2, cost); | ||
5297 | /* | ||
5298 | * Flush target caches to RAM and invalidate them: | ||
5299 | */ | ||
5300 | sched_cacheflush(); | ||
5301 | |||
5302 | set_cpus_allowed(current, saved_mask); | ||
5303 | |||
5304 | return cost; | ||
5305 | } | ||
5306 | |||
5307 | /* | ||
5308 | * Measure a series of task migrations and return the average | ||
5309 | * result. Since this code runs early during bootup the system | ||
5310 | * is 'undisturbed' and the average latency makes sense. | ||
5311 | * | ||
5312 | * The algorithm in essence auto-detects the relevant cache-size, | ||
5313 | * so it will properly detect different cachesizes for different | ||
5314 | * cache-hierarchies, depending on how the CPUs are connected. | ||
5315 | * | ||
5316 | * Architectures can prime the upper limit of the search range via | ||
5317 | * max_cache_size, otherwise the search range defaults to 64K...10MB. | ||
5318 | */ | ||
5319 | static unsigned long long | ||
5320 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
5321 | { | ||
5322 | unsigned long long cost1, cost2; | ||
5323 | int i; | ||
5324 | |||
5325 | /* | ||
5326 | * Measure the migration cost of 'size' bytes, averaged | ||
5327 | * over 2*ITERATIONS runs: | ||
5328 | * | ||
5329 | * (We perturb the buffer size by a small (0..1k) | ||
5330 | * value to compensate for size/alignment related artifacts. | ||
5331 | * We also subtract the cost of the operation done on | ||
5332 | * the same CPU.) | ||
5333 | */ | ||
5334 | cost1 = 0; | ||
5335 | |||
5336 | /* | ||
5337 | * dry run, to make sure we start off cache-cold on cpu1, | ||
5338 | * and to get any vmalloc pagefaults in advance: | ||
5339 | */ | ||
5340 | measure_one(cache, size, cpu1, cpu2); | ||
5341 | for (i = 0; i < ITERATIONS; i++) | ||
5342 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | ||
5343 | |||
5344 | measure_one(cache, size, cpu2, cpu1); | ||
5345 | for (i = 0; i < ITERATIONS; i++) | ||
5346 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | ||
5347 | |||
5348 | /* | ||
5349 | * (We measure the non-migrating [cached] cost on both | ||
5350 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
5351 | */ | ||
5352 | cost2 = 0; | ||
5353 | |||
5354 | measure_one(cache, size, cpu1, cpu1); | ||
5355 | for (i = 0; i < ITERATIONS; i++) | ||
5356 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | ||
5357 | |||
5358 | measure_one(cache, size, cpu2, cpu2); | ||
5359 | for (i = 0; i < ITERATIONS; i++) | ||
5360 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | ||
5361 | |||
5362 | /* | ||
5363 | * Get the per-iteration migration cost: | ||
5364 | */ | ||
5365 | do_div(cost1, 2*ITERATIONS); | ||
5366 | do_div(cost2, 2*ITERATIONS); | ||
5367 | |||
5368 | return cost1 - cost2; | ||
5369 | } | ||
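do_div() is the kernel's in-place 64-bit division helper: it replaces its first argument with the quotient and evaluates to the remainder, which is why the two calls above look like statements rather than assignments. A crude user-space stand-in (GCC statement expression, hypothetical inputs, for illustration only):

#include <stdio.h>

/* rough user-space stand-in for the kernel's do_div(): divide n in
 * place by base, yield the remainder */
#define do_div(n, base) ({				\
	unsigned long long __rem = (n) % (base);	\
	(n) /= (base);					\
	__rem;						\
})

int main(void)
{
	unsigned long long cost1 = 4100000, cost2 = 1700000;	/* hypothetical sums, nsec */
	const unsigned int iterations = 2;

	do_div(cost1, 2 * iterations);	/* per-iteration cross-CPU cost */
	do_div(cost2, 2 * iterations);	/* per-iteration same-CPU cost */
	printf("migration cost: %lld nsec\n", (long long)(cost1 - cost2));
	return 0;
}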
5370 | |||
5371 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
5372 | { | ||
5373 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
5374 | unsigned int max_size, size, size_found = 0; | ||
5375 | long long cost = 0, prev_cost; | ||
5376 | void *cache; | ||
5377 | |||
5378 | /* | ||
5379 | * Search between max_cache_size/2 and max_cache_size*2 (but at | ||
5380 | * least 64K) - the real relevant cache size has to lie somewhere in between. | ||
5381 | */ | ||
5382 | if (max_cache_size) { | ||
5383 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5384 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5385 | } else { | ||
5386 | /* | ||
5387 | * Since we have no estimate of the relevant search | ||
5388 | * range, fall back to generous defaults: | ||
5389 | */ | ||
5390 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
5391 | size = MIN_CACHE_SIZE; | ||
5392 | } | ||
5393 | |||
5394 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
5395 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
5396 | return 0; | ||
5397 | } | ||
5398 | |||
5399 | /* | ||
5400 | * Allocate the working set: | ||
5401 | */ | ||
5402 | cache = vmalloc(max_size); | ||
5403 | if (!cache) { | ||
5404 | printk("could not vmalloc %d bytes for cache!\n", max_size); | ||
5405 | return 1000000; /* return 1 msec on very small boxen */ | ||
5406 | } | ||
5407 | |||
5408 | while (size <= max_size) { | ||
5409 | prev_cost = cost; | ||
5410 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
5411 | |||
5412 | /* | ||
5413 | * Update the max: | ||
5414 | */ | ||
5415 | if (cost > 0) { | ||
5416 | if (max_cost < cost) { | ||
5417 | max_cost = cost; | ||
5418 | size_found = size; | ||
5419 | } | ||
5420 | } | ||
5421 | /* | ||
5422 | * Calculate average fluctuation, we use this to prevent | ||
5423 | * noise from triggering an early break out of the loop: | ||
5424 | */ | ||
5425 | fluct = abs(cost - prev_cost); | ||
5426 | avg_fluct = (avg_fluct + fluct)/2; | ||
5427 | |||
5428 | if (migration_debug) | ||
5429 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | ||
5430 | cpu1, cpu2, size, | ||
5431 | (long)cost / 1000000, | ||
5432 | ((long)cost / 100000) % 10, | ||
5433 | (long)max_cost / 1000000, | ||
5434 | ((long)max_cost / 100000) % 10, | ||
5435 | domain_distance(cpu1, cpu2), | ||
5436 | cost, avg_fluct); | ||
5437 | |||
5438 | /* | ||
5439 | * If we iterated at least 30% past the size of the previous maximum, | ||
5440 | * and the cost has dropped substantially below the maximum already | ||
5441 | * (taking fluctuations into account), then we assume we have found | ||
5442 | * the maximum and break out of the loop early: | ||
5443 | */ | ||
5444 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
5445 | if (cost+avg_fluct <= 0 || | ||
5446 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
5447 | |||
5448 | if (migration_debug) | ||
5449 | printk("-> found max.\n"); | ||
5450 | break; | ||
5451 | } | ||
5452 | /* | ||
5453 | * Increase the cachesize in 5% steps: | ||
5454 | */ | ||
5455 | size = size * 20 / 19; | ||
5456 | } | ||
5457 | |||
5458 | if (migration_debug) | ||
5459 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
5460 | cpu1, cpu2, size_found, max_cost); | ||
5461 | |||
5462 | vfree(cache); | ||
5463 | |||
5464 | /* | ||
5465 | * A task is considered 'cache cold' if at least 2 times | ||
5466 | * the worst-case cost of migration has passed. | ||
5467 | * | ||
5468 | * (this limit is only listened to if the load-balancing | ||
5469 | * situation is 'nice' - if there is a large imbalance we | ||
5470 | * ignore it for the sake of CPU utilization and | ||
5471 | * processing fairness.) | ||
5472 | */ | ||
5473 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
5474 | } | ||
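The search loop's geometry is easy to check in isolation: sizes grow by a factor of 20/19 (about 5%), and the early break can fire once the current size is 30% past the size where the maximum was seen. A toy run with made-up numbers:

#include <stdio.h>

#define MIN_CACHE_SIZE	(64*1024U)
#define SIZE_THRESH	130

int main(void)
{
	unsigned int size = MIN_CACHE_SIZE;
	unsigned int max_size = 10*1024*1024U;		/* hypothetical upper bound */
	unsigned int size_found = 2*1024*1024U;		/* pretend the cost peaked at 2MB */
	int steps = 0;

	while (size <= max_size) {
		if (size_found && size*100 > size_found*SIZE_THRESH) {
			printf("break-out at %u bytes after %d steps\n", size, steps);
			return 0;
		}
		size = size * 20 / 19;	/* ~5% steps, as in the kernel loop */
		steps++;
	}
	printf("hit max_size without breaking out\n");
	return 0;
}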
5475 | |||
5476 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
5477 | { | ||
5478 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
5479 | unsigned long j0, j1, distance, max_distance = 0; | ||
5480 | struct sched_domain *sd; | ||
5481 | |||
5482 | j0 = jiffies; | ||
5483 | |||
5484 | /* | ||
5485 | * First pass - calculate the cacheflush times: | ||
5486 | */ | ||
5487 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
5488 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
5489 | if (cpu1 == cpu2) | ||
5490 | continue; | ||
5491 | distance = domain_distance(cpu1, cpu2); | ||
5492 | max_distance = max(max_distance, distance); | ||
5493 | /* | ||
5494 | * No result cached yet? | ||
5495 | */ | ||
5496 | if (migration_cost[distance] == -1LL) | ||
5497 | migration_cost[distance] = | ||
5498 | measure_migration_cost(cpu1, cpu2); | ||
5499 | } | ||
5500 | } | ||
5501 | /* | ||
5502 | * Second pass - update the sched domain hierarchy with | ||
5503 | * the new cache-hot-time estimations: | ||
5504 | */ | ||
5505 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5506 | distance = 0; | ||
5507 | for_each_domain(cpu, sd) { | ||
5508 | sd->cache_hot_time = migration_cost[distance]; | ||
5509 | distance++; | ||
5510 | } | ||
5511 | } | ||
5512 | /* | ||
5513 | * Print the matrix: | ||
5514 | */ | ||
5515 | if (migration_debug) | ||
5516 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
5517 | max_cache_size, | ||
5518 | #ifdef CONFIG_X86 | ||
5519 | cpu_khz/1000 | ||
5520 | #else | ||
5521 | -1 | ||
5522 | #endif | ||
5523 | ); | ||
5524 | printk("migration_cost="); | ||
5525 | for (distance = 0; distance <= max_distance; distance++) { | ||
5526 | if (distance) | ||
5527 | printk(","); | ||
5528 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
5529 | } | ||
5530 | printk("\n"); | ||
5531 | j1 = jiffies; | ||
5532 | if (migration_debug) | ||
5533 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | ||
5534 | |||
5535 | /* | ||
5536 | * Move back to the original CPU. NUMA-Q gets confused | ||
5537 | * if we migrate to another quad during bootup. | ||
5538 | */ | ||
5539 | if (raw_smp_processor_id() != orig_cpu) { | ||
5540 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
5541 | saved_mask = current->cpus_allowed; | ||
5542 | |||
5543 | set_cpus_allowed(current, mask); | ||
5544 | set_cpus_allowed(current, saved_mask); | ||
5545 | } | ||
5546 | } | ||
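Note that the values printed on the migration_cost= line above are in exactly the form accepted by the boot parameter earlier in this patch, so a line such as migration_cost=4000,8000 (hypothetical output from a previous boot) can be passed back on the kernel command line; the first calibration pass then finds those distance levels already filled in (not -1) and skips re-measuring them.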
5547 | |||
5085 | #ifdef CONFIG_NUMA | 5548 | #ifdef CONFIG_NUMA |
5549 | |||
5086 | /** | 5550 | /** |
5087 | * find_next_best_node - find the next node to include in a sched_domain | 5551 | * find_next_best_node - find the next node to include in a sched_domain |
5088 | * @node: node whose sched_domain we're building | 5552 | * @node: node whose sched_domain we're building |
@@ -5448,6 +5912,10 @@ next_sg: | |||
5448 | #endif | 5912 | #endif |
5449 | cpu_attach_domain(sd, i); | 5913 | cpu_attach_domain(sd, i); |
5450 | } | 5914 | } |
5915 | /* | ||
5916 | * Tune cache-hot values: | ||
5917 | */ | ||
5918 | calibrate_migration_costs(cpu_map); | ||
5451 | } | 5919 | } |
5452 | /* | 5920 | /* |
5453 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5921 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |