Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/hrtimer.c |  34
-rw-r--r-- | kernel/sched.c   | 478
2 files changed, 496 insertions, 16 deletions
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f073a2461f..04ccab099e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -275,7 +275,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
275 | * The number of overruns is added to the overrun field. | 275 | * The number of overruns is added to the overrun field. |
276 | */ | 276 | */ |
277 | unsigned long | 277 | unsigned long |
278 | hrtimer_forward(struct hrtimer *timer, const ktime_t interval) | 278 | hrtimer_forward(struct hrtimer *timer, ktime_t interval) |
279 | { | 279 | { |
280 | unsigned long orun = 1; | 280 | unsigned long orun = 1; |
281 | ktime_t delta, now; | 281 | ktime_t delta, now; |
@@ -287,6 +287,9 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval) | |||
287 | if (delta.tv64 < 0) | 287 | if (delta.tv64 < 0) |
288 | return 0; | 288 | return 0; |
289 | 289 | ||
290 | if (interval.tv64 < timer->base->resolution.tv64) | ||
291 | interval.tv64 = timer->base->resolution.tv64; | ||
292 | |||
290 | if (unlikely(delta.tv64 >= interval.tv64)) { | 293 | if (unlikely(delta.tv64 >= interval.tv64)) { |
291 | nsec_t incr = ktime_to_ns(interval); | 294 | nsec_t incr = ktime_to_ns(interval); |
292 | 295 | ||
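
The two hrtimer_forward() hunks above clamp the forwarding interval to the timer base's resolution before the overrun arithmetic runs, so an interval shorter than what the clock can resolve can no longer re-arm a timer in the past. A minimal userspace sketch of that arithmetic (plain signed nanoseconds standing in for ktime_t, with made-up example values; not the kernel code):

#include <stdio.h>
#include <stdint.h>

/* Sketch of hrtimer_forward()'s overrun logic: push 'expires' forward in
 * whole 'interval' steps until it lies past 'now', counting how many
 * periods were skipped. The interval is clamped to 'resolution' first,
 * mirroring the change in this hunk. All values are nanoseconds. */
static uint64_t forward_timer(int64_t *expires, int64_t now,
                              int64_t interval, int64_t resolution)
{
    uint64_t orun = 1;
    int64_t delta = now - *expires;

    if (delta < 0)                      /* not expired yet: nothing to do */
        return 0;

    if (interval < resolution)
        interval = resolution;

    if (delta >= interval) {            /* skipped more than one period */
        orun = delta / interval;
        *expires += orun * interval;
        if (*expires > now)
            return orun;
        orun++;                         /* one more step to get past now */
    }
    *expires += interval;
    return orun;
}

int main(void)
{
    int64_t expires = 1000;             /* assumed example values (ns) */
    uint64_t orun = forward_timer(&expires, 10500, 3000, 1000);

    printf("overruns=%llu new expiry=%lld\n",
           (unsigned long long)orun, (long long)expires);
    return 0;
}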
@@ -314,7 +317,6 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval) | |||
314 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 317 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) |
315 | { | 318 | { |
316 | struct rb_node **link = &base->active.rb_node; | 319 | struct rb_node **link = &base->active.rb_node; |
317 | struct list_head *prev = &base->pending; | ||
318 | struct rb_node *parent = NULL; | 320 | struct rb_node *parent = NULL; |
319 | struct hrtimer *entry; | 321 | struct hrtimer *entry; |
320 | 322 | ||
@@ -330,22 +332,23 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
330 | */ | 332 | */ |
331 | if (timer->expires.tv64 < entry->expires.tv64) | 333 | if (timer->expires.tv64 < entry->expires.tv64) |
332 | link = &(*link)->rb_left; | 334 | link = &(*link)->rb_left; |
333 | else { | 335 | else |
334 | link = &(*link)->rb_right; | 336 | link = &(*link)->rb_right; |
335 | prev = &entry->list; | ||
336 | } | ||
337 | } | 337 | } |
338 | 338 | ||
339 | /* | 339 | /* |
340 | * Insert the timer to the rbtree and to the sorted list: | 340 | * Insert the timer to the rbtree and check whether it |
341 | * replaces the first pending timer | ||
341 | */ | 342 | */ |
342 | rb_link_node(&timer->node, parent, link); | 343 | rb_link_node(&timer->node, parent, link); |
343 | rb_insert_color(&timer->node, &base->active); | 344 | rb_insert_color(&timer->node, &base->active); |
344 | list_add(&timer->list, prev); | ||
345 | 345 | ||
346 | timer->state = HRTIMER_PENDING; | 346 | timer->state = HRTIMER_PENDING; |
347 | } | ||
348 | 347 | ||
348 | if (!base->first || timer->expires.tv64 < | ||
349 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | ||
350 | base->first = &timer->node; | ||
351 | } | ||
349 | 352 | ||
350 | /* | 353 | /* |
351 | * __remove_hrtimer - internal function to remove a timer | 354 | * __remove_hrtimer - internal function to remove a timer |
@@ -355,9 +358,11 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
355 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 358 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) |
356 | { | 359 | { |
357 | /* | 360 | /* |
358 | * Remove the timer from the sorted list and from the rbtree: | 361 | * Remove the timer from the rbtree and replace the |
362 | * first entry pointer if necessary. | ||
359 | */ | 363 | */ |
360 | list_del(&timer->list); | 364 | if (base->first == &timer->node) |
365 | base->first = rb_next(&timer->node); | ||
361 | rb_erase(&timer->node, &base->active); | 366 | rb_erase(&timer->node, &base->active); |
362 | } | 367 | } |
363 | 368 | ||
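
Taken together, the enqueue_hrtimer()/__remove_hrtimer() hunks drop the separate sorted list and instead cache a pointer to the leftmost, earliest-expiring rbtree node in base->first: insertion updates it when the new timer expires before the current first, removal advances it to the node's in-order successor, and the run_hrtimer_queue() hunk further down consumes timers starting from it. A rough self-contained sketch of that bookkeeping, using an unbalanced binary search tree as a stand-in for the kernel rbtree (illustrative only, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Sketch: an ordered tree of timers keyed by expiry, with a cached
 * pointer to the earliest entry, mirroring base->first in this patch. */
struct tnode {
    long long expires;
    struct tnode *left, *right, *parent;
};

struct tbase {
    struct tnode *root;
    struct tnode *first;        /* cached earliest-expiring node */
};

static void timer_enqueue(struct tbase *b, struct tnode *n)
{
    struct tnode **link = &b->root, *parent = NULL;

    while (*link) {
        parent = *link;
        link = (n->expires < parent->expires) ?
                &parent->left : &parent->right;
    }
    n->parent = parent;
    n->left = n->right = NULL;
    *link = n;

    /* Mirrors the enqueue_hrtimer() change: the new timer becomes the
     * first entry if the tree was empty or it expires earliest. */
    if (!b->first || n->expires < b->first->expires)
        b->first = n;
}

static struct tnode *timer_next(struct tnode *n)
{
    /* in-order successor, the role rb_next() plays in __remove_hrtimer() */
    if (n->right) {
        for (n = n->right; n->left; n = n->left)
            ;
        return n;
    }
    while (n->parent && n == n->parent->right)
        n = n->parent;
    return n->parent;
}

int main(void)
{
    struct tbase base = { NULL, NULL };
    struct tnode t[4] = { { 40 }, { 10 }, { 30 }, { 20 } };

    for (int i = 0; i < 4; i++)
        timer_enqueue(&base, &t[i]);

    /* Walk timers in expiry order from the cached first node, the same
     * order run_hrtimer_queue() processes them in. On removal the patch
     * advances base->first to rb_next() before erasing the node. */
    for (struct tnode *n = base.first; n; n = timer_next(n))
        printf("expires %lld\n", n->expires);
    return 0;
}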
@@ -516,9 +521,8 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
516 | { | 521 | { |
517 | struct hrtimer_base *bases; | 522 | struct hrtimer_base *bases; |
518 | 523 | ||
519 | tp->tv_sec = 0; | ||
520 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 524 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); |
521 | tp->tv_nsec = bases[which_clock].resolution; | 525 | *tp = ktime_to_timespec(bases[which_clock].resolution); |
522 | 526 | ||
523 | return 0; | 527 | return 0; |
524 | } | 528 | } |
@@ -529,16 +533,17 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
529 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | 533 | static inline void run_hrtimer_queue(struct hrtimer_base *base) |
530 | { | 534 | { |
531 | ktime_t now = base->get_time(); | 535 | ktime_t now = base->get_time(); |
536 | struct rb_node *node; | ||
532 | 537 | ||
533 | spin_lock_irq(&base->lock); | 538 | spin_lock_irq(&base->lock); |
534 | 539 | ||
535 | while (!list_empty(&base->pending)) { | 540 | while ((node = base->first)) { |
536 | struct hrtimer *timer; | 541 | struct hrtimer *timer; |
537 | int (*fn)(void *); | 542 | int (*fn)(void *); |
538 | int restart; | 543 | int restart; |
539 | void *data; | 544 | void *data; |
540 | 545 | ||
541 | timer = list_entry(base->pending.next, struct hrtimer, list); | 546 | timer = rb_entry(node, struct hrtimer, node); |
542 | if (now.tv64 <= timer->expires.tv64) | 547 | if (now.tv64 <= timer->expires.tv64) |
543 | break; | 548 | break; |
544 | 549 | ||
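
The run_hrtimer_queue() hunk above now walks expired timers straight off the rbtree via base->first instead of the old pending list, stopping at the first timer that has not expired yet. A small standalone sketch of that consumption pattern, with an array scanned for the minimum standing in for the tree; the callback, its restart behaviour, and all values are invented for illustration:

#include <stdio.h>

/* Sketch of the consumption loop: keep taking the earliest pending timer,
 * stop at the first one that has not expired yet, otherwise run its
 * callback and, if the callback asks for it, re-arm the timer. */
enum { HR_NORESTART, HR_RESTART };

struct fake_timer {
    long long expires;
    int pending;
    int fires_left;                     /* hypothetical per-timer state */
};

/* callback: fire, and request a restart until fires_left runs out */
static int fake_fn(struct fake_timer *t)
{
    printf("fired: expires=%lld\n", t->expires);
    if (--t->fires_left > 0) {
        t->expires += 250;              /* as hrtimer_forward() would do */
        return HR_RESTART;
    }
    return HR_NORESTART;
}

static struct fake_timer *first_pending(struct fake_timer *t, int n)
{
    struct fake_timer *best = NULL;
    for (int i = 0; i < n; i++)
        if (t[i].pending && (!best || t[i].expires < best->expires))
            best = &t[i];
    return best;
}

int main(void)
{
    struct fake_timer timers[] = {
        { 100, 1, 2 }, { 350, 1, 1 }, { 900, 1, 1 },
    };
    long long now = 600;
    struct fake_timer *t;

    while ((t = first_pending(timers, 3)) && t->expires < now) {
        t->pending = 0;                 /* __remove_hrtimer() equivalent */
        if (fake_fn(t) == HR_RESTART)
            t->pending = 1;             /* enqueue_hrtimer() equivalent */
    }
    return 0;
}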
@@ -732,7 +737,6 @@ static void __devinit init_hrtimers_cpu(int cpu) | |||
732 | 737 | ||
733 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | 738 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { |
734 | spin_lock_init(&base->lock); | 739 | spin_lock_init(&base->lock); |
735 | INIT_LIST_HEAD(&base->pending); | ||
736 | base++; | 740 | base++; |
737 | } | 741 | } |
738 | } | 742 | } |
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d..c9dec2aa19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
36 | #include <linux/suspend.h> | 36 | #include <linux/suspend.h> |
37 | #include <linux/vmalloc.h> | ||
37 | #include <linux/blkdev.h> | 38 | #include <linux/blkdev.h> |
38 | #include <linux/delay.h> | 39 | #include <linux/delay.h> |
39 | #include <linux/smp.h> | 40 | #include <linux/smp.h> |
@@ -1289,6 +1290,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1289 | } | 1290 | } |
1290 | } | 1291 | } |
1291 | 1292 | ||
1293 | if (p->last_waker_cpu != this_cpu) | ||
1294 | goto out_set_cpu; | ||
1295 | |||
1292 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1296 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1293 | goto out_set_cpu; | 1297 | goto out_set_cpu; |
1294 | 1298 | ||
@@ -1359,6 +1363,8 @@ out_set_cpu: | |||
1359 | cpu = task_cpu(p); | 1363 | cpu = task_cpu(p); |
1360 | } | 1364 | } |
1361 | 1365 | ||
1366 | p->last_waker_cpu = this_cpu; | ||
1367 | |||
1362 | out_activate: | 1368 | out_activate: |
1363 | #endif /* CONFIG_SMP */ | 1369 | #endif /* CONFIG_SMP */ |
1364 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1370 | if (old_state == TASK_UNINTERRUPTIBLE) { |
@@ -1440,9 +1446,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1440 | #ifdef CONFIG_SCHEDSTATS | 1446 | #ifdef CONFIG_SCHEDSTATS |
1441 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1447 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1442 | #endif | 1448 | #endif |
1443 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1449 | #if defined(CONFIG_SMP) |
1450 | p->last_waker_cpu = cpu; | ||
1451 | #if defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
1444 | p->oncpu = 0; | 1452 | p->oncpu = 0; |
1445 | #endif | 1453 | #endif |
1454 | #endif | ||
1446 | #ifdef CONFIG_PREEMPT | 1455 | #ifdef CONFIG_PREEMPT |
1447 | /* Want to start with kernel preemption disabled. */ | 1456 | /* Want to start with kernel preemption disabled. */ |
1448 | task_thread_info(p)->preempt_count = 1; | 1457 | task_thread_info(p)->preempt_count = 1; |
@@ -5082,7 +5091,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5082 | 5091 | ||
5083 | #define SD_NODES_PER_DOMAIN 16 | 5092 | #define SD_NODES_PER_DOMAIN 16 |
5084 | 5093 | ||
5094 | /* | ||
5095 | * Self-tuning task migration cost measurement between source and target CPUs. | ||
5096 | * | ||
5097 | * This is done by measuring the cost of manipulating buffers of varying | ||
5098 | * sizes. For a given buffer-size here are the steps that are taken: | ||
5099 | * | ||
5100 | * 1) the source CPU reads+dirties a shared buffer | ||
5101 | * 2) the target CPU reads+dirties the same shared buffer | ||
5102 | * | ||
5103 | * We measure how long they take, in the following 4 scenarios: | ||
5104 | * | ||
5105 | * - source: CPU1, target: CPU2 | cost1 | ||
5106 | * - source: CPU2, target: CPU1 | cost2 | ||
5107 | * - source: CPU1, target: CPU1 | cost3 | ||
5108 | * - source: CPU2, target: CPU2 | cost4 | ||
5109 | * | ||
5110 | * We then calculate the cost3+cost4-cost1-cost2 difference - this is | ||
5111 | * the cost of migration. | ||
5112 | * | ||
5113 | * We then start off from a small buffer-size and iterate up to larger | ||
5114 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | ||
5115 | * doing a maximum search for the cost. (The maximum cost for a migration | ||
5116 | * normally occurs when the working set size is around the effective cache | ||
5117 | * size.) | ||
5118 | */ | ||
5119 | #define SEARCH_SCOPE 2 | ||
5120 | #define MIN_CACHE_SIZE (64*1024U) | ||
5121 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | ||
5122 | #define ITERATIONS 2 | ||
5123 | #define SIZE_THRESH 130 | ||
5124 | #define COST_THRESH 130 | ||
5125 | |||
5126 | /* | ||
5127 | * The migration cost is a function of 'domain distance'. Domain | ||
5128 | * distance is the number of steps a CPU has to iterate down its | ||
5129 | * domain tree to share a domain with the other CPU. The farther | ||
5130 | * two CPUs are from each other, the larger the distance gets. | ||
5131 | * | ||
5132 | * Note that we use the distance only to cache measurement results, | ||
5133 | * the distance value is not used numerically otherwise. When two | ||
5134 | * CPUs have the same distance it is assumed that the migration | ||
5135 | * cost is the same. (this is a simplification but quite practical) | ||
5136 | */ | ||
5137 | #define MAX_DOMAIN_DISTANCE 32 | ||
5138 | |||
5139 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5140 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; | ||
5141 | |||
5142 | /* | ||
5143 | * Allow override of migration cost - in units of microseconds. | ||
5144 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5145 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
5146 | */ | ||
5147 | static int __init migration_cost_setup(char *str) | ||
5148 | { | ||
5149 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5150 | |||
5151 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5152 | |||
5153 | printk("#ints: %d\n", ints[0]); | ||
5154 | for (i = 1; i <= ints[0]; i++) { | ||
5155 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5156 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5157 | } | ||
5158 | return 1; | ||
5159 | } | ||
5160 | |||
5161 | __setup ("migration_cost=", migration_cost_setup); | ||
5162 | |||
5163 | /* | ||
5164 | * Global multiplier (divisor) for migration-cutoff values, | ||
5165 | * in percentiles. E.g. use a value of 150 to get 1.5 times | ||
5166 | * longer cache-hot cutoff times. | ||
5167 | * | ||
5168 | * (We scale it from 100 to 128 to make long long handling easier.) ||
5169 | */ | ||
5170 | |||
5171 | #define MIGRATION_FACTOR_SCALE 128 | ||
5172 | |||
5173 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5174 | |||
5175 | static int __init setup_migration_factor(char *str) | ||
5176 | { | ||
5177 | get_option(&str, &migration_factor); | ||
5178 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5179 | return 1; | ||
5180 | } | ||
5181 | |||
5182 | __setup("migration_factor=", setup_migration_factor); | ||
5183 | |||
5184 | /* | ||
5185 | * Estimated distance of two CPUs, measured via the number of domains | ||
5186 | * we have to pass for the two CPUs to be in the same span: | ||
5187 | */ | ||
5188 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5189 | { | ||
5190 | unsigned long distance = 0; | ||
5191 | struct sched_domain *sd; | ||
5192 | |||
5193 | for_each_domain(cpu1, sd) { | ||
5194 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5195 | if (cpu_isset(cpu2, sd->span)) | ||
5196 | return distance; | ||
5197 | distance++; | ||
5198 | } | ||
5199 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5200 | WARN_ON(1); | ||
5201 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5202 | } | ||
5203 | |||
5204 | return distance; | ||
5205 | } | ||
5206 | |||
5207 | static unsigned int migration_debug; | ||
5208 | |||
5209 | static int __init setup_migration_debug(char *str) | ||
5210 | { | ||
5211 | get_option(&str, &migration_debug); | ||
5212 | return 1; | ||
5213 | } | ||
5214 | |||
5215 | __setup("migration_debug=", setup_migration_debug); | ||
5216 | |||
5217 | /* | ||
5218 | * Maximum cache-size that the scheduler should try to measure. | ||
5219 | * Architectures with larger caches should tune this up during | ||
5220 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5221 | * bootup). | ||
5222 | */ | ||
5223 | unsigned int max_cache_size; | ||
5224 | |||
5225 | static int __init setup_max_cache_size(char *str) | ||
5226 | { | ||
5227 | get_option(&str, &max_cache_size); | ||
5228 | return 1; | ||
5229 | } | ||
5230 | |||
5231 | __setup("max_cache_size=", setup_max_cache_size); | ||
5232 | |||
5233 | /* | ||
5234 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5235 | * is the operation that is timed, so we try to generate unpredictable | ||
5236 | * cachemisses that still end up filling the L2 cache: | ||
5237 | */ | ||
5238 | static void touch_cache(void *__cache, unsigned long __size) | ||
5239 | { | ||
5240 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | ||
5241 | chunk2 = 2*size/3; | ||
5242 | unsigned long *cache = __cache; | ||
5243 | int i; | ||
5244 | |||
5245 | for (i = 0; i < size/6; i += 8) { | ||
5246 | switch (i % 6) { | ||
5247 | case 0: cache[i]++; | ||
5248 | case 1: cache[size-1-i]++; | ||
5249 | case 2: cache[chunk1-i]++; | ||
5250 | case 3: cache[chunk1+i]++; | ||
5251 | case 4: cache[chunk2-i]++; | ||
5252 | case 5: cache[chunk2+i]++; | ||
5253 | } | ||
5254 | } | ||
5255 | } | ||
5256 | |||
5257 | /* | ||
5258 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5259 | */ | ||
5260 | static unsigned long long measure_one(void *cache, unsigned long size, | ||
5261 | int source, int target) | ||
5262 | { | ||
5263 | cpumask_t mask, saved_mask; | ||
5264 | unsigned long long t0, t1, t2, t3, cost; | ||
5265 | |||
5266 | saved_mask = current->cpus_allowed; | ||
5267 | |||
5268 | /* | ||
5269 | * Flush source caches to RAM and invalidate them: | ||
5270 | */ | ||
5271 | sched_cacheflush(); | ||
5272 | |||
5273 | /* | ||
5274 | * Migrate to the source CPU: | ||
5275 | */ | ||
5276 | mask = cpumask_of_cpu(source); | ||
5277 | set_cpus_allowed(current, mask); | ||
5278 | WARN_ON(smp_processor_id() != source); | ||
5279 | |||
5280 | /* | ||
5281 | * Dirty the working set: | ||
5282 | */ | ||
5283 | t0 = sched_clock(); | ||
5284 | touch_cache(cache, size); | ||
5285 | t1 = sched_clock(); | ||
5286 | |||
5287 | /* | ||
5288 | * Migrate to the target CPU, dirty the L2 cache and access | ||
5289 | * the shared buffer. (which represents the working set | ||
5290 | * of a migrated task.) | ||
5291 | */ | ||
5292 | mask = cpumask_of_cpu(target); | ||
5293 | set_cpus_allowed(current, mask); | ||
5294 | WARN_ON(smp_processor_id() != target); | ||
5295 | |||
5296 | t2 = sched_clock(); | ||
5297 | touch_cache(cache, size); | ||
5298 | t3 = sched_clock(); | ||
5299 | |||
5300 | cost = t1-t0 + t3-t2; | ||
5301 | |||
5302 | if (migration_debug >= 2) | ||
5303 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
5304 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
5305 | /* | ||
5306 | * Flush target caches to RAM and invalidate them: | ||
5307 | */ | ||
5308 | sched_cacheflush(); | ||
5309 | |||
5310 | set_cpus_allowed(current, saved_mask); | ||
5311 | |||
5312 | return cost; | ||
5313 | } | ||
5314 | |||
5315 | /* | ||
5316 | * Measure a series of task migrations and return the average | ||
5317 | * result. Since this code runs early during bootup the system | ||
5318 | * is 'undisturbed' and the average latency makes sense. | ||
5319 | * | ||
5320 | * The algorithm in essence auto-detects the relevant cache-size, | ||
5321 | * so it will properly detect different cachesizes for different | ||
5322 | * cache-hierarchies, depending on how the CPUs are connected. | ||
5323 | * | ||
5324 | * Architectures can prime the upper limit of the search range via | ||
5325 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | ||
5326 | */ | ||
5327 | static unsigned long long | ||
5328 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
5329 | { | ||
5330 | unsigned long long cost1, cost2; | ||
5331 | int i; | ||
5332 | |||
5333 | /* | ||
5334 | * Measure the migration cost of 'size' bytes, over an | ||
5335 | * average of 10 runs: | ||
5336 | * | ||
5337 | * (We perturb the cache size by a small (0..4k) | ||
5338 | * value to compensate size/alignment related artifacts. | ||
5339 | * We also subtract the cost of the operation done on | ||
5340 | * the same CPU.) | ||
5341 | */ | ||
5342 | cost1 = 0; | ||
5343 | |||
5344 | /* | ||
5345 | * dry run, to make sure we start off cache-cold on cpu1, | ||
5346 | * and to get any vmalloc pagefaults in advance: | ||
5347 | */ | ||
5348 | measure_one(cache, size, cpu1, cpu2); | ||
5349 | for (i = 0; i < ITERATIONS; i++) | ||
5350 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | ||
5351 | |||
5352 | measure_one(cache, size, cpu2, cpu1); | ||
5353 | for (i = 0; i < ITERATIONS; i++) | ||
5354 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | ||
5355 | |||
5356 | /* | ||
5357 | * (We measure the non-migrating [cached] cost on both | ||
5358 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
5359 | */ | ||
5360 | cost2 = 0; | ||
5361 | |||
5362 | measure_one(cache, size, cpu1, cpu1); | ||
5363 | for (i = 0; i < ITERATIONS; i++) | ||
5364 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | ||
5365 | |||
5366 | measure_one(cache, size, cpu2, cpu2); | ||
5367 | for (i = 0; i < ITERATIONS; i++) | ||
5368 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | ||
5369 | |||
5370 | /* | ||
5371 | * Get the per-iteration migration cost: | ||
5372 | */ | ||
5373 | do_div(cost1, 2*ITERATIONS); | ||
5374 | do_div(cost2, 2*ITERATIONS); | ||
5375 | |||
5376 | return cost1 - cost2; | ||
5377 | } | ||
5378 | |||
5379 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
5380 | { | ||
5381 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
5382 | unsigned int max_size, size, size_found = 0; | ||
5383 | long long cost = 0, prev_cost; | ||
5384 | void *cache; | ||
5385 | |||
5386 | /* | ||
5387 | * Search from max_cache_size*5 down to 64K - the real relevant | ||
5388 | * cachesize has to lie somewhere inbetween. | ||
5389 | */ | ||
5390 | if (max_cache_size) { | ||
5391 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5392 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5393 | } else { | ||
5394 | /* | ||
5395 | * Since we have no estimation about the relevant | ||
5396 | * search range | ||
5397 | */ | ||
5398 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
5399 | size = MIN_CACHE_SIZE; | ||
5400 | } | ||
5401 | |||
5402 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
5403 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
5404 | return 0; | ||
5405 | } | ||
5406 | |||
5407 | /* | ||
5408 | * Allocate the working set: | ||
5409 | */ | ||
5410 | cache = vmalloc(max_size); | ||
5411 | if (!cache) { | ||
5412 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | ||
5413 | return 1000000; // return 1 msec on very small boxen | ||
5414 | } | ||
5415 | |||
5416 | while (size <= max_size) { | ||
5417 | prev_cost = cost; | ||
5418 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
5419 | |||
5420 | /* | ||
5421 | * Update the max: | ||
5422 | */ | ||
5423 | if (cost > 0) { | ||
5424 | if (max_cost < cost) { | ||
5425 | max_cost = cost; | ||
5426 | size_found = size; | ||
5427 | } | ||
5428 | } | ||
5429 | /* | ||
5430 | * Calculate average fluctuation, we use this to prevent | ||
5431 | * noise from triggering an early break out of the loop: | ||
5432 | */ | ||
5433 | fluct = abs(cost - prev_cost); | ||
5434 | avg_fluct = (avg_fluct + fluct)/2; | ||
5435 | |||
5436 | if (migration_debug) | ||
5437 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | ||
5438 | cpu1, cpu2, size, | ||
5439 | (long)cost / 1000000, | ||
5440 | ((long)cost / 100000) % 10, | ||
5441 | (long)max_cost / 1000000, | ||
5442 | ((long)max_cost / 100000) % 10, | ||
5443 | domain_distance(cpu1, cpu2), | ||
5444 | cost, avg_fluct); | ||
5445 | |||
5446 | /* | ||
5447 | * If we iterated at least 20% past the previous maximum, | ||
5448 | * and the cost has dropped by more than 20% already, | ||
5449 | * (taking fluctuations into account) then we assume to | ||
5450 | * have found the maximum and break out of the loop early: | ||
5451 | */ | ||
5452 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
5453 | if (cost+avg_fluct <= 0 || | ||
5454 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
5455 | |||
5456 | if (migration_debug) | ||
5457 | printk("-> found max.\n"); | ||
5458 | break; | ||
5459 | } | ||
5460 | /* | ||
5461 | * Increase the cachesize in 5% steps: | ||
5462 | */ | ||
5463 | size = size * 20 / 19; | ||
5464 | } | ||
5465 | |||
5466 | if (migration_debug) | ||
5467 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
5468 | cpu1, cpu2, size_found, max_cost); | ||
5469 | |||
5470 | vfree(cache); | ||
5471 | |||
5472 | /* | ||
5473 | * A task is considered 'cache cold' if at least 2 times | ||
5474 | * the worst-case cost of migration has passed. | ||
5475 | * | ||
5476 | * (this limit is only listened to if the load-balancing | ||
5477 | * situation is 'nice' - if there is a large imbalance we | ||
5478 | * ignore it for the sake of CPU utilization and | ||
5479 | * processing fairness.) | ||
5480 | */ | ||
5481 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
5482 | } | ||
5483 | |||
5484 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
5485 | { | ||
5486 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
5487 | unsigned long j0, j1, distance, max_distance = 0; | ||
5488 | struct sched_domain *sd; | ||
5489 | |||
5490 | j0 = jiffies; | ||
5491 | |||
5492 | /* | ||
5493 | * First pass - calculate the cacheflush times: | ||
5494 | */ | ||
5495 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
5496 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
5497 | if (cpu1 == cpu2) | ||
5498 | continue; | ||
5499 | distance = domain_distance(cpu1, cpu2); | ||
5500 | max_distance = max(max_distance, distance); | ||
5501 | /* | ||
5502 | * No result cached yet? | ||
5503 | */ | ||
5504 | if (migration_cost[distance] == -1LL) | ||
5505 | migration_cost[distance] = | ||
5506 | measure_migration_cost(cpu1, cpu2); | ||
5507 | } | ||
5508 | } | ||
5509 | /* | ||
5510 | * Second pass - update the sched domain hierarchy with | ||
5511 | * the new cache-hot-time estimations: | ||
5512 | */ | ||
5513 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5514 | distance = 0; | ||
5515 | for_each_domain(cpu, sd) { | ||
5516 | sd->cache_hot_time = migration_cost[distance]; | ||
5517 | distance++; | ||
5518 | } | ||
5519 | } | ||
5520 | /* | ||
5521 | * Print the matrix: | ||
5522 | */ | ||
5523 | if (migration_debug) | ||
5524 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
5525 | max_cache_size, | ||
5526 | #ifdef CONFIG_X86 | ||
5527 | cpu_khz/1000 | ||
5528 | #else | ||
5529 | -1 | ||
5530 | #endif | ||
5531 | ); | ||
5532 | printk("migration_cost="); | ||
5533 | for (distance = 0; distance <= max_distance; distance++) { | ||
5534 | if (distance) | ||
5535 | printk(","); | ||
5536 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
5537 | } | ||
5538 | printk("\n"); | ||
5539 | j1 = jiffies; | ||
5540 | if (migration_debug) | ||
5541 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | ||
5542 | |||
5543 | /* | ||
5544 | * Move back to the original CPU. NUMA-Q gets confused | ||
5545 | * if we migrate to another quad during bootup. | ||
5546 | */ | ||
5547 | if (raw_smp_processor_id() != orig_cpu) { | ||
5548 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
5549 | saved_mask = current->cpus_allowed; | ||
5550 | |||
5551 | set_cpus_allowed(current, mask); | ||
5552 | set_cpus_allowed(current, saved_mask); | ||
5553 | } | ||
5554 | } | ||
5555 | |||
5085 | #ifdef CONFIG_NUMA | 5556 | #ifdef CONFIG_NUMA |
5557 | |||
5086 | /** | 5558 | /** |
5087 | * find_next_best_node - find the next node to include in a sched_domain | 5559 | * find_next_best_node - find the next node to include in a sched_domain |
5088 | * @node: node whose sched_domain we're building | 5560 | * @node: node whose sched_domain we're building |
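
As a concrete reading of the measurement code above: measure_cost() sums the two source!=target runs into cost1 and the two source==target runs into cost2, averages both per iteration with do_div(), and returns the difference, i.e. the extra cache cost a migrated task pays; measure_migration_cost() then turns the largest such difference found over the size search into the cache-hot cutoff by doubling it and applying migration_factor. A hedged sketch with invented timings (nothing here is a measured value):

#include <stdio.h>

/* Worked sketch of the cost bookkeeping in measure_cost() and
 * measure_migration_cost() above, with invented timing numbers (ns).
 * cross[] are source!=target runs, same[] are source==target runs. */
int main(void)
{
    /* hypothetical per-run timings, nanoseconds */
    unsigned long long cross[4] = { 910000, 905000, 915000, 900000 };
    unsigned long long same[4]  = { 410000, 405000, 415000, 400000 };
    unsigned long long cost1 = 0, cost2 = 0, max_cost;
    unsigned int migration_factor = 128;      /* == MIGRATION_FACTOR_SCALE */
    int i, iterations = 2;                    /* == ITERATIONS */

    for (i = 0; i < 2 * iterations; i++) {
        cost1 += cross[i];                    /* CPU1->CPU2 and CPU2->CPU1 */
        cost2 += same[i];                     /* CPU1->CPU1 and CPU2->CPU2 */
    }
    cost1 /= 2 * iterations;                  /* per-iteration averages */
    cost2 /= 2 * iterations;

    max_cost = cost1 - cost2;                 /* extra cost of migrating */

    /* A task is treated as cache-cold once twice this worst-case cost
     * has passed, optionally scaled by migration_factor (128 == 1.0). */
    printf("migration cost: %llu ns, cache-hot cutoff: %llu ns\n",
           max_cost, 2 * max_cost * migration_factor / 128);
    return 0;
}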
@@ -5448,6 +5920,10 @@ next_sg: | |||
5448 | #endif | 5920 | #endif |
5449 | cpu_attach_domain(sd, i); | 5921 | cpu_attach_domain(sd, i); |
5450 | } | 5922 | } |
5923 | /* | ||
5924 | * Tune cache-hot values: | ||
5925 | */ | ||
5926 | calibrate_migration_costs(cpu_map); | ||
5451 | } | 5927 | } |
5452 | /* | 5928 | /* |
5453 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5929 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
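
The calibrate_migration_costs() call added to build_sched_domains() above drives measure_migration_cost(), whose inner loop grows the buffer size by size*20/19 (roughly 5%) per step, tracks the running maximum cost, and breaks out early once the size is at least 30% past the best size found so far (SIZE_THRESH 130) and the fluctuation-smoothed cost has fallen clearly below that maximum (COST_THRESH 130). A standalone sketch of just that search skeleton, with a synthetic cost curve in place of the real measurements; the peak location and all numbers are invented:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the size-search loop in measure_migration_cost(): a synthetic
 * cost curve (peaking near a pretend 2 MB cache) replaces the real
 * measure_cost() calls; the step/threshold logic mirrors the patch. */
#define SIZE_THRESH 130
#define COST_THRESH 130

static long long fake_cost(unsigned int size)
{
    /* hypothetical curve: rises to a peak at 2 MB, then falls off */
    long long peak = 2 * 1024 * 1024;
    long long d = (long long)size - peak;
    return 1000000 - (d < 0 ? -d : d) / 4;
}

int main(void)
{
    unsigned int size = 64 * 1024, max_size = 10 * 1024 * 1024;
    unsigned int size_found = 0;
    long long cost = 0, prev_cost, max_cost = 0, fluct, avg_fluct = 0;

    while (size <= max_size) {
        prev_cost = cost;
        cost = fake_cost(size);

        if (cost > 0 && cost > max_cost) {          /* track the maximum */
            max_cost = cost;
            size_found = size;
        }

        /* smoothed fluctuation guards against noise-triggered breaks */
        fluct = llabs(cost - prev_cost);
        avg_fluct = (avg_fluct + fluct) / 2;

        /* stop once we are 30% past the best size and clearly below the peak */
        if (size_found && size * 100 > size_found * SIZE_THRESH &&
            (cost + avg_fluct <= 0 ||
             max_cost * 100 > (cost + avg_fluct) * COST_THRESH)) {
            printf("found max at %u bytes, cost %lld\n", size_found, max_cost);
            break;
        }

        size = size * 20 / 19;                      /* ~5% larger next step */
    }
    return 0;
}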