Diffstat (limited to 'kernel')

 -rw-r--r--  kernel/hrtimer.c |  34
 -rw-r--r--  kernel/sched.c   | 478
 2 files changed, 496 insertions, 16 deletions
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f073a2461faa..04ccab099e84 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -275,7 +275,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * The number of overruns is added to the overrun field.
  */
 unsigned long
-hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
+hrtimer_forward(struct hrtimer *timer, ktime_t interval)
 {
         unsigned long orun = 1;
         ktime_t delta, now;
@@ -287,6 +287,9 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
         if (delta.tv64 < 0)
                 return 0;
 
+        if (interval.tv64 < timer->base->resolution.tv64)
+                interval.tv64 = timer->base->resolution.tv64;
+
         if (unlikely(delta.tv64 >= interval.tv64)) {
                 nsec_t incr = ktime_to_ns(interval);
 
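The interval clamp added in this hunk keeps hrtimer_forward() from being asked to step a timer forward by less than the clock's resolution; an interval finer than the resolution cannot yield distinct expiry times anyway. A rough illustration with assumed values (not taken from the patch): with a base resolution of 1 ms, a requested interval of 100 us and a timer that is 5 ms late, the unclamped arithmetic would account on the order of 50 overruns, while with the clamp the effective interval becomes 1 ms and only 5 overruns are reported.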
@@ -314,7 +317,6 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
 static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
         struct rb_node **link = &base->active.rb_node;
-        struct list_head *prev = &base->pending;
         struct rb_node *parent = NULL;
         struct hrtimer *entry;
 
@@ -330,22 +332,23 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
                  */
                 if (timer->expires.tv64 < entry->expires.tv64)
                         link = &(*link)->rb_left;
-                else {
+                else
                         link = &(*link)->rb_right;
-                        prev = &entry->list;
-                }
         }
 
         /*
-         * Insert the timer to the rbtree and to the sorted list:
+         * Insert the timer to the rbtree and check whether it
+         * replaces the first pending timer
          */
         rb_link_node(&timer->node, parent, link);
         rb_insert_color(&timer->node, &base->active);
-        list_add(&timer->list, prev);
 
         timer->state = HRTIMER_PENDING;
-}
 
+        if (!base->first || timer->expires.tv64 <
+            rb_entry(base->first, struct hrtimer, node)->expires.tv64)
+                base->first = &timer->node;
+}
 
 /*
  * __remove_hrtimer - internal function to remove a timer
@@ -355,9 +358,11 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
         /*
-         * Remove the timer from the sorted list and from the rbtree:
+         * Remove the timer from the rbtree and replace the
+         * first entry pointer if necessary.
          */
-        list_del(&timer->list);
+        if (base->first == &timer->node)
+                base->first = rb_next(&timer->node);
         rb_erase(&timer->node, &base->active);
 }
 
@@ -516,9 +521,8 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
         struct hrtimer_base *bases;
 
-        tp->tv_sec = 0;
         bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
-        tp->tv_nsec = bases[which_clock].resolution;
+        *tp = ktime_to_timespec(bases[which_clock].resolution);
 
         return 0;
 }
@@ -529,16 +533,17 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
         ktime_t now = base->get_time();
+        struct rb_node *node;
 
         spin_lock_irq(&base->lock);
 
-        while (!list_empty(&base->pending)) {
+        while ((node = base->first)) {
                 struct hrtimer *timer;
                 int (*fn)(void *);
                 int restart;
                 void *data;
 
-                timer = list_entry(base->pending.next, struct hrtimer, list);
+                timer = rb_entry(node, struct hrtimer, node);
                 if (now.tv64 <= timer->expires.tv64)
                         break;
 
@@ -732,7 +737,6 @@ static void __devinit init_hrtimers_cpu(int cpu)
 
         for (i = 0; i < MAX_HRTIMER_BASES; i++) {
                 spin_lock_init(&base->lock);
-                INIT_LIST_HEAD(&base->pending);
                 base++;
         }
 }
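Taken together, the hrtimer.c hunks drop the auxiliary sorted list and make the rbtree the single ordering structure, with base->first caching the leftmost (earliest-expiring) node so run_hrtimer_queue() still reaches its next candidate in O(1). A condensed sketch of the resulting pattern, using the names that appear in the diff (base->active, base->first, timer->node, expires.tv64); the sketch_* function names are mine, and locking, state handling and the callback machinery are omitted:

        /* Sketch only - mirrors the pattern in the hunks above. */
        static void sketch_enqueue(struct hrtimer *timer, struct hrtimer_base *base)
        {
                struct rb_node **link = &base->active.rb_node, *parent = NULL;

                while (*link) {
                        struct hrtimer *entry = rb_entry(*link, struct hrtimer, node);

                        parent = *link;
                        if (timer->expires.tv64 < entry->expires.tv64)
                                link = &(*link)->rb_left;
                        else
                                link = &(*link)->rb_right;
                }
                rb_link_node(&timer->node, parent, link);
                rb_insert_color(&timer->node, &base->active);

                /* New earliest expiry? Then it becomes the cached leftmost node. */
                if (!base->first || timer->expires.tv64 <
                    rb_entry(base->first, struct hrtimer, node)->expires.tv64)
                        base->first = &timer->node;
        }

        static void sketch_remove(struct hrtimer *timer, struct hrtimer_base *base)
        {
                /* Keep the cache valid: advance it before erasing the leftmost node. */
                if (base->first == &timer->node)
                        base->first = rb_next(&timer->node);
                rb_erase(&timer->node, &base->active);
        }

The expiry loop then only has to look at base->first instead of walking a separate pending list, which is exactly what the run_hrtimer_queue() hunk above does.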
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d5e..c9dec2aa1976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -1289,6 +1290,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
                 }
         }
 
+        if (p->last_waker_cpu != this_cpu)
+                goto out_set_cpu;
+
         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                 goto out_set_cpu;
 
@@ -1359,6 +1363,8 @@ out_set_cpu:
                 cpu = task_cpu(p);
         }
 
+        p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
         if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1440,9 +1446,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
         memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+        p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
         p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
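The last_waker_cpu hunks add a simple damping rule to the wakeup path. The code between the two try_to_wake_up() hunks (not shown in this diff) is the wake-affinity logic that may pull the woken task over to the waking CPU; the new check appears to let that logic run only when the same CPU also performed the previous wakeup, and sched_fork() seeds the field with the forking CPU. Reduced to its skeleton (a paraphrase of the hunks above, not additional code from the patch):

        if (p->last_waker_cpu != this_cpu)      /* previous waker was a different CPU */
                goto out_set_cpu;               /* skip the pull-to-this-CPU logic */
        /* ... existing wake-affinity / imbalance checks ... */
        p->last_waker_cpu = this_cpu;           /* remember this waker for next time */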
@@ -5082,7 +5091,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the (cost1+cost2) - (cost3+cost4) difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
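As a worked illustration of the difference computed above (the numbers are invented for the example, they are not measurements):

        cost1 (CPU1 -> CPU2) = 2.0 ms        cost3 (CPU1 -> CPU1) = 1.2 ms
        cost2 (CPU2 -> CPU1) = 2.1 ms        cost4 (CPU2 -> CPU2) = 1.3 ms

        migration cost = (cost1 + cost2) - (cost3 + cost4)
                       = (2.0 + 2.1) - (1.2 + 1.3) = 1.6 ms

This matches what measure_cost() further down returns: the summed cross-CPU runs minus the summed same-CPU runs.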
+#define SEARCH_SCOPE            2
+#define MIN_CACHE_SIZE          (64*1024U)
+#define DEFAULT_CACHE_SIZE      (5*1024*1024U)
+#define ITERATIONS              2
+#define SIZE_THRESH             130
+#define COST_THRESH             130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+                { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+        int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+        str = get_options(str, ARRAY_SIZE(ints), ints);
+
+        printk("#ints: %d\n", ints[0]);
+        for (i = 1; i <= ints[0]; i++) {
+                migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+                printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+        }
+        return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+        get_option(&str, &migration_factor);
+        migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+        return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+        unsigned long distance = 0;
+        struct sched_domain *sd;
+
+        for_each_domain(cpu1, sd) {
+                WARN_ON(!cpu_isset(cpu1, sd->span));
+                if (cpu_isset(cpu2, sd->span))
+                        return distance;
+                distance++;
+        }
+        if (distance >= MAX_DOMAIN_DISTANCE) {
+                WARN_ON(1);
+                distance = MAX_DOMAIN_DISTANCE-1;
+        }
+
+        return distance;
+}
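To make 'domain distance' concrete, here is how the function above would classify CPU pairs on a hypothetical two-socket machine with SMT (illustration only; the actual levels depend on which sched domains the architecture builds):

        domain_distance(cpu, its SMT sibling)            = 0   (share the lowest-level domain)
        domain_distance(cpu, another core, same socket)  = 1
        domain_distance(cpu, any CPU in the other socket) = 2

Each distinct distance gets a single slot in migration_cost[], so the expensive measurement below runs once per topology level rather than once per CPU pair.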
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+        get_option(&str, &migration_debug);
+        return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+        get_option(&str, &max_cache_size);
+        return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
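The four __setup() handlers above make the whole mechanism tunable from the kernel command line. A hypothetical combination (the values are only an example) might be:

        migration_cost=1000,2000 migration_factor=150 migration_debug=1 max_cache_size=2097152

which pre-loads the first two distance levels with 1 ms and 2 ms (calibrate_migration_costs() below skips the measurement for any slot that is no longer -1), stretches all cache-hot cutoffs by 1.5x, enables the boot-time debug output, and centres the working-set search on a 2 MB cache.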
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+        unsigned long size = __size/sizeof(long), chunk1 = size/3,
+                        chunk2 = 2*size/3;
+        unsigned long *cache = __cache;
+        int i;
+
+        for (i = 0; i < size/6; i += 8) {
+                switch (i % 6) {
+                        case 0: cache[i]++;
+                        case 1: cache[size-1-i]++;
+                        case 2: cache[chunk1-i]++;
+                        case 3: cache[chunk1+i]++;
+                        case 4: cache[chunk2-i]++;
+                        case 5: cache[chunk2+i]++;
+                }
+        }
+}
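touch_cache() strides through the buffer from three alternating regions so the access pattern is hard for the hardware prefetcher to hide. To get a feel for the same effect outside the kernel, here is a rough user-space analogue (illustration only, written for this note: it uses clock_gettime() and calloc() instead of sched_clock() and vmalloc(), makes no attempt to pin itself to a CPU, and keeps the original's switch fall-through):

        #include <stdio.h>
        #include <stdlib.h>
        #include <time.h>

        /* Same interleaved access pattern as touch_cache() above. */
        static void touch(unsigned long *buf, unsigned long longs)
        {
                unsigned long chunk1 = longs / 3, chunk2 = 2 * longs / 3, i;

                for (i = 0; i < longs / 6; i += 8) {
                        switch (i % 6) {
                        case 0: buf[i]++;               /* falls through, like the original */
                        case 1: buf[longs - 1 - i]++;
                        case 2: buf[chunk1 - i]++;
                        case 3: buf[chunk1 + i]++;
                        case 4: buf[chunk2 - i]++;
                        case 5: buf[chunk2 + i]++;
                        }
                }
        }

        int main(void)
        {
                unsigned long longs = (8 * 1024 * 1024) / sizeof(unsigned long);
                unsigned long *buf = calloc(longs, sizeof(unsigned long));
                struct timespec t0, t1;

                if (!buf)
                        return 1;
                clock_gettime(CLOCK_MONOTONIC, &t0);
                touch(buf, longs);
                clock_gettime(CLOCK_MONOTONIC, &t1);
                printf("dirtying 8 MB took %ld ns\n",
                       (long)(t1.tv_sec - t0.tv_sec) * 1000000000L +
                       (t1.tv_nsec - t0.tv_nsec));
                free(buf);
                return 0;
        }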
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+                                      int source, int target)
+{
+        cpumask_t mask, saved_mask;
+        unsigned long long t0, t1, t2, t3, cost;
+
+        saved_mask = current->cpus_allowed;
+
+        /*
+         * Flush source caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        /*
+         * Migrate to the source CPU:
+         */
+        mask = cpumask_of_cpu(source);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != source);
+
+        /*
+         * Dirty the working set:
+         */
+        t0 = sched_clock();
+        touch_cache(cache, size);
+        t1 = sched_clock();
+
+        /*
+         * Migrate to the target CPU, dirty the L2 cache and access
+         * the shared buffer. (which represents the working set
+         * of a migrated task.)
+         */
+        mask = cpumask_of_cpu(target);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != target);
+
+        t2 = sched_clock();
+        touch_cache(cache, size);
+        t3 = sched_clock();
+
+        cost = t1-t0 + t3-t2;
+
+        if (migration_debug >= 2)
+                printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+                        source, target, t1-t0, t1-t0, t3-t2, cost);
+        /*
+         * Flush target caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        set_cpus_allowed(current, saved_mask);
+
+        return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 10MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+        unsigned long long cost1, cost2;
+        int i;
+
+        /*
+         * Measure the migration cost of 'size' bytes, averaged over
+         * 2*ITERATIONS runs:
+         *
+         * (We perturb the cache size by a small (0..4k)
+         * value to compensate for size/alignment related artifacts.
+         * We also subtract the cost of the operation done on
+         * the same CPU.)
+         */
+        cost1 = 0;
+
+        /*
+         * dry run, to make sure we start off cache-cold on cpu1,
+         * and to get any vmalloc pagefaults in advance:
+         */
+        measure_one(cache, size, cpu1, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+        measure_one(cache, size, cpu2, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+        /*
+         * (We measure the non-migrating [cached] cost on both
+         *  cpu1 and cpu2, to handle CPUs with different speeds)
+         */
+        cost2 = 0;
+
+        measure_one(cache, size, cpu1, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+        measure_one(cache, size, cpu2, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+        /*
+         * Get the per-iteration migration cost:
+         */
+        do_div(cost1, 2*ITERATIONS);
+        do_div(cost2, 2*ITERATIONS);
+
+        return cost1 - cost2;
+}
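With ITERATIONS defined as 2 above, one call to measure_cost() ends up doing four timed cross-CPU passes (ITERATIONS per direction, each direction preceded by an untimed dry run) and four timed same-CPU passes, so the do_div() by 2*ITERATIONS turns both sums into per-pass averages. With invented numbers:

        cost1 = sum of the 4 cross-CPU passes = 8.4 ms   ->  8.4 / 4 = 2.1 ms per pass
        cost2 = sum of the 4 same-CPU passes  = 5.2 ms   ->  5.2 / 4 = 1.3 ms per pass

        measure_cost() returns 2.1 ms - 1.3 ms = 0.8 ms for this buffer size.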
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+        unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+        unsigned int max_size, size, size_found = 0;
+        long long cost = 0, prev_cost;
+        void *cache;
+
+        /*
+         * Search from max_cache_size*SEARCH_SCOPE down to 64K - the real
+         * relevant cachesize has to lie somewhere in between.
+         */
+        if (max_cache_size) {
+                max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+                size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+        } else {
+                /*
+                 * Since we have no estimation about the relevant
+                 * search range, pick a generous default range:
+                 */
+                max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+                size = MIN_CACHE_SIZE;
+        }
+
+        if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+                printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+                return 0;
+        }
+
+        /*
+         * Allocate the working set:
+         */
+        cache = vmalloc(max_size);
+        if (!cache) {
+                printk("could not vmalloc %d bytes for cache!\n", max_size);
+                return 1000000; /* return 1 msec on very small boxen */
+        }
+
+        while (size <= max_size) {
+                prev_cost = cost;
+                cost = measure_cost(cpu1, cpu2, cache, size);
+
+                /*
+                 * Update the max:
+                 */
+                if (cost > 0) {
+                        if (max_cost < cost) {
+                                max_cost = cost;
+                                size_found = size;
+                        }
+                }
+                /*
+                 * Calculate average fluctuation, we use this to prevent
+                 * noise from triggering an early break out of the loop:
+                 */
+                fluct = abs(cost - prev_cost);
+                avg_fluct = (avg_fluct + fluct)/2;
+
+                if (migration_debug)
+                        printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                                cpu1, cpu2, size,
+                                (long)cost / 1000000,
+                                ((long)cost / 100000) % 10,
+                                (long)max_cost / 1000000,
+                                ((long)max_cost / 100000) % 10,
+                                domain_distance(cpu1, cpu2),
+                                cost, avg_fluct);
+
+                /*
+                 * If we iterated well past the previous maximum (SIZE_THRESH)
+                 * and the cost has already dropped noticeably (COST_THRESH),
+                 * taking fluctuations into account, then we assume to
+                 * have found the maximum and break out of the loop early:
+                 */
+                if (size_found && (size*100 > size_found*SIZE_THRESH))
+                        if (cost+avg_fluct <= 0 ||
+                                max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+                                if (migration_debug)
+                                        printk("-> found max.\n");
+                                break;
+                        }
+                /*
+                 * Increase the cachesize in 5% steps:
+                 */
+                size = size * 20 / 19;
+        }
+
+        if (migration_debug)
+                printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+                        cpu1, cpu2, size_found, max_cost);
+
+        vfree(cache);
+
+        /*
+         * A task is considered 'cache cold' if at least 2 times
+         * the worst-case cost of migration has passed.
+         *
+         * (this limit is only honored if the load-balancing
+         *  situation is 'nice' - if there is a large imbalance we
+         *  ignore it for the sake of CPU utilization and
+         *  processing fairness.)
+         */
+        return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
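The value handed back to the domain-setup code is not the raw maximum but twice the maximum, scaled by migration_factor. With invented numbers: if the peak per-migration cost found by the loop is 0.8 ms, then

        cache-hot cutoff = 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE
                         = 2 * 0.8 ms * 128 / 128 = 1.6 ms   (default migration_factor)
                         = 2 * 0.8 ms * 192 / 128 = 2.4 ms   (after booting with migration_factor=150)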
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+        int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+        unsigned long j0, j1, distance, max_distance = 0;
+        struct sched_domain *sd;
+
+        j0 = jiffies;
+
+        /*
+         * First pass - calculate the cacheflush times:
+         */
+        for_each_cpu_mask(cpu1, *cpu_map) {
+                for_each_cpu_mask(cpu2, *cpu_map) {
+                        if (cpu1 == cpu2)
+                                continue;
+                        distance = domain_distance(cpu1, cpu2);
+                        max_distance = max(max_distance, distance);
+                        /*
+                         * No result cached yet?
+                         */
+                        if (migration_cost[distance] == -1LL)
+                                migration_cost[distance] =
+                                        measure_migration_cost(cpu1, cpu2);
+                }
+        }
+        /*
+         * Second pass - update the sched domain hierarchy with
+         * the new cache-hot-time estimations:
+         */
+        for_each_cpu_mask(cpu, *cpu_map) {
+                distance = 0;
+                for_each_domain(cpu, sd) {
+                        sd->cache_hot_time = migration_cost[distance];
+                        distance++;
+                }
+        }
+        /*
+         * Print the matrix:
+         */
+        if (migration_debug)
+                printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+                        max_cache_size,
+#ifdef CONFIG_X86
+                        cpu_khz/1000
+#else
+                        -1
+#endif
+                );
+        printk("migration_cost=");
+        for (distance = 0; distance <= max_distance; distance++) {
+                if (distance)
+                        printk(",");
+                printk("%ld", (long)migration_cost[distance] / 1000);
+        }
+        printk("\n");
+        j1 = jiffies;
+        if (migration_debug)
+                printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+        /*
+         * Move back to the original CPU. NUMA-Q gets confused
+         * if we migrate to another quad during bootup.
+         */
+        if (raw_smp_processor_id() != orig_cpu) {
+                cpumask_t mask = cpumask_of_cpu(orig_cpu),
+                        saved_mask = current->cpus_allowed;
+
+                set_cpus_allowed(current, mask);
+                set_cpus_allowed(current, saved_mask);
+        }
+}
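With migration_debug set, the printk()s in this function produce boot output roughly of the following shape (the numbers here are made up; the formats are the ones visible in the code above). The migration_cost= line is deliberately printed in the same syntax as the boot option, so a measured result can be pasted back onto the command line to skip the calibration on later boots:

        migration: max_cache_size: 2097152, cpu: 2400 MHz:
        migration_cost=1600,3800
        migration: 2 seconds

The cache_hot_time values written into each sched_domain in the second pass are what the load balancer later consults when deciding whether a recently-run task is still cache hot on its current CPU.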
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5448,6 +5920,10 @@ next_sg:
 #endif
                 cpu_attach_domain(sd, i);
         }
+        /*
+         * Tune cache-hot values:
+         */
+        calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
