path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	561
1 file changed, 534 insertions(+), 27 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..3ee2ae45125f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,12 +27,14 @@
 #include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
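The new RCU callback only pays off together with a call_rcu()-based release path for task structures. A minimal sketch of the intended caller, assuming task_struct has grown a struct rcu_head member named rcu and keeps its usual atomic usage count (the matching sched.h change is outside this file's diff):

	static inline void put_task_struct(struct task_struct *t)
	{
		/* defer the actual free until all RCU read-side
		 * critical sections in flight have completed: */
		if (atomic_dec_and_test(&t->usage))
			call_rcu(&t->rcu, __put_task_struct_cb);
	}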
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
  * long it was waiting to run. We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
-static inline void sched_info_arrive(task_t *t)
+static void sched_info_arrive(task_t *t)
 {
 	unsigned long now = jiffies, diff = 0;
 	struct runqueue *rq = task_rq(t);
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 	unsigned long long __sleep_time = now - p->timestamp;
 	unsigned long sleep_time;
 
-	if (__sleep_time > NS_MAX_SLEEP_AVG)
-		sleep_time = NS_MAX_SLEEP_AVG;
-	else
-		sleep_time = (unsigned long)__sleep_time;
+	if (unlikely(p->policy == SCHED_BATCH))
+		sleep_time = 0;
+	else {
+		if (__sleep_time > NS_MAX_SLEEP_AVG)
+			sleep_time = NS_MAX_SLEEP_AVG;
+		else
+			sleep_time = (unsigned long)__sleep_time;
+	}
 
 	if (likely(sleep_time > 0)) {
 		/*
@@ -994,7 +1007,7 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long running = rq->nr_running;
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 		}
 	}
 
+	if (p->last_waker_cpu != this_cpu)
+		goto out_set_cpu;
+
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
@@ -1351,6 +1367,8 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
+	p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
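Read together, the two wakeup hunks implement a small stability filter (condensed pseudo-C sketch, not the literal control flow of try_to_wake_up(); label names are from the surrounding function):

	/* only trust the affine-wakeup fast path while the task keeps
	 * being woken from the same CPU: */
	if (p->last_waker_cpu != this_cpu)
		goto out_set_cpu;	/* fall back to regular wake balancing */
	...
	out_set_cpu:
		...			/* wake balancing picks a target CPU */
	p->last_waker_cpu = this_cpu;	/* remember the waker for next time */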
@@ -1432,9 +1450,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+	p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
@@ -1849,7 +1870,7 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline
+static
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
-static inline
+static
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 		     struct sched_domain *sd, enum idle_type idle,
 		     int *all_pinned)
@@ -2357,7 +2378,7 @@ out_balanced:
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *sd;
 
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
 		resched_task(rq->idle);
 }
 
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
 	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * not SCHED_NORMAL:
+	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
 	if (rt_task(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
+	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
 		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
+	} else {
 		p->prio = p->static_prio;
+		/*
+		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+		 */
+		if (policy == SCHED_BATCH)
+			p->sleep_avg = 0;
+	}
 }
 
 /**
@@ -3716,29 +3743,35 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL)
+			policy != SCHED_NORMAL && policy != SCHED_BATCH)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
 	 */
 	if (param->sched_priority < 0 ||
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+					!= (param->sched_priority == 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		/* can't change policy */
-		if (policy != p->policy &&
-			!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+		/*
+		 * can't change policy, except between SCHED_NORMAL
+		 * and SCHED_BATCH:
+		 */
+		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
 			return -EPERM;
 		/* can't increase priority */
-		if (policy != SCHED_NORMAL &&
+		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
 		    param->sched_priority > p->rt_priority &&
 		    param->sched_priority >
 		    p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 				       struct sched_param __user *param)
 {
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
 	return do_sched_setscheduler(pid, policy, param);
 }
 
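With the policy validation above in place, a task can put itself into the new batch class from userspace. A minimal sketch (it assumes a libc of this era that does not define SCHED_BATCH yet; on the kernel side of this series the policy value is 3):

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_BATCH
	#define SCHED_BATCH 3	/* assumed value, from the kernel side of this series */
	#endif

	int main(void)
	{
		/* like SCHED_NORMAL, SCHED_BATCH only accepts priority 0 --
		 * sched_get_priority_min()/max() both report 0 for it: */
		struct sched_param param = { .sched_priority = 0 };

		if (sched_setscheduler(0, SCHED_BATCH, &param) == -1)
			perror("sched_setscheduler(SCHED_BATCH)");
		return 0;
	}

Per the CAP_SYS_NICE hunk above, a SCHED_NORMAL task may switch itself to SCHED_BATCH (and back) without privileges or RLIMIT_RTPRIO headroom.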
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
  * method, such as ACPI for e.g.
  */
 
-cpumask_t cpu_present_map;
+cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
-cpumask_t cpu_online_map = CPU_MASK_ALL;
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 		break;
 	}
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
 		ret = 1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 	}
 	return ret;
@@ -4379,6 +4418,7 @@ void show_state(void)
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
+	mutex_debug_show_all_locks();
 }
 
 /**
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost1+cost2-cost3-cost4 difference - i.e. how
+ * much more expensive the two migrating runs are than the two local
+ * runs: this is the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE		2
+#define MIN_CACHE_SIZE		(64*1024U)
+#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
+#define ITERATIONS		2
+#define SIZE_THRESH		130
+#define COST_THRESH		130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+	int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+
+	printk("#ints: %d\n", ints[0]);
+	for (i = 1; i <= ints[0]; i++) {
+		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+	}
+	return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percent. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+	get_option(&str, &migration_factor);
+	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+	return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+	unsigned long distance = 0;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu1, sd) {
+		WARN_ON(!cpu_isset(cpu1, sd->span));
+		if (cpu_isset(cpu2, sd->span))
+			return distance;
+		distance++;
+	}
+	if (distance >= MAX_DOMAIN_DISTANCE) {
+		WARN_ON(1);
+		distance = MAX_DOMAIN_DISTANCE-1;
+	}
+
+	return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+	get_option(&str, &migration_debug);
+	return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+	get_option(&str, &max_cache_size);
+	return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+	unsigned long size = __size/sizeof(long), chunk1 = size/3,
+			chunk2 = 2*size/3;
+	unsigned long *cache = __cache;
+	int i;
+
+	for (i = 0; i < size/6; i += 8) {
+		switch (i % 6) {
+			/* no breaks: each pass deliberately falls through
+			 * and dirties several spread-out cachelines: */
+			case 0: cache[i]++;
+			case 1: cache[size-1-i]++;
+			case 2: cache[chunk1-i]++;
+			case 3: cache[chunk1+i]++;
+			case 4: cache[chunk2-i]++;
+			case 5: cache[chunk2+i]++;
+		}
+	}
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+				      int source, int target)
+{
+	cpumask_t mask, saved_mask;
+	unsigned long long t0, t1, t2, t3, cost;
+
+	saved_mask = current->cpus_allowed;
+
+	/*
+	 * Flush source caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	/*
+	 * Migrate to the source CPU:
+	 */
+	mask = cpumask_of_cpu(source);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != source);
+
+	/*
+	 * Dirty the working set:
+	 */
+	t0 = sched_clock();
+	touch_cache(cache, size);
+	t1 = sched_clock();
+
+	/*
+	 * Migrate to the target CPU, dirty the L2 cache and access
+	 * the shared buffer. (which represents the working set
+	 * of a migrated task.)
+	 */
+	mask = cpumask_of_cpu(target);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != target);
+
+	t2 = sched_clock();
+	touch_cache(cache, size);
+	t3 = sched_clock();
+
+	cost = t1-t0 + t3-t2;
+
+	if (migration_debug >= 2)
+		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+			source, target, t1-t0, t2-t1, t3-t2, cost);
+	/*
+	 * Flush target caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	set_cpus_allowed(current, saved_mask);
+
+	return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 10MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+	unsigned long long cost1, cost2;
+	int i;
+
+	/*
+	 * Measure the migration cost of 'size' bytes, averaged over
+	 * 2*ITERATIONS runs in each direction:
+	 *
+	 * (We perturb the cache size by a small (0..4k)
+	 * value to compensate for size/alignment related artifacts.
+	 * We also subtract the cost of the operation done on
+	 * the same CPU.)
+	 */
+	cost1 = 0;
+
+	/*
+	 * dry run, to make sure we start off cache-cold on cpu1,
+	 * and to get any vmalloc pagefaults in advance:
+	 */
+	measure_one(cache, size, cpu1, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+	measure_one(cache, size, cpu2, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+	/*
+	 * (We measure the non-migrating [cached] cost on both
+	 * cpu1 and cpu2, to handle CPUs with different speeds)
+	 */
+	cost2 = 0;
+
+	measure_one(cache, size, cpu1, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+	measure_one(cache, size, cpu2, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+	/*
+	 * Get the per-iteration migration cost:
+	 */
+	do_div(cost1, 2*ITERATIONS);
+	do_div(cost2, 2*ITERATIONS);
+
+	return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+	unsigned int max_size, size, size_found = 0;
+	long long cost = 0, prev_cost;
+	void *cache;
+
+	/*
+	 * Search from max_cache_size*SEARCH_SCOPE down to 64K - the real
+	 * relevant cachesize has to lie somewhere in between.
+	 */
+	if (max_cache_size) {
+		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+	} else {
+		/*
+		 * Since we have no estimate of the relevant search
+		 * range, fall back to the defaults:
+		 */
+		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+		size = MIN_CACHE_SIZE;
+	}
+
+	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+		return 0;
+	}
+
+	/*
+	 * Allocate the working set:
+	 */
+	cache = vmalloc(max_size);
+	if (!cache) {
+		printk("could not vmalloc %d bytes for cache!\n", max_size);
+		return 1000000; /* return 1 msec on very small boxen */
+	}
+
+	while (size <= max_size) {
+		prev_cost = cost;
+		cost = measure_cost(cpu1, cpu2, cache, size);
+
+		/*
+		 * Update the max:
+		 */
+		if (cost > 0) {
+			if (max_cost < cost) {
+				max_cost = cost;
+				size_found = size;
+			}
+		}
+		/*
+		 * Calculate average fluctuation, we use this to prevent
+		 * noise from triggering an early break out of the loop:
+		 */
+		fluct = abs(cost - prev_cost);
+		avg_fluct = (avg_fluct + fluct)/2;
+
+		if (migration_debug)
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+				cpu1, cpu2, size,
+				(long)cost / 1000000,
+				((long)cost / 100000) % 10,
+				(long)max_cost / 1000000,
+				((long)max_cost / 100000) % 10,
+				domain_distance(cpu1, cpu2),
+				cost, avg_fluct);
+
+		/*
+		 * If we iterated at least 20% past the previous maximum,
+		 * and the cost has dropped by more than 20% already,
+		 * (taking fluctuations into account) then we assume to
+		 * have found the maximum and break out of the loop early:
+		 */
+		if (size_found && (size*100 > size_found*SIZE_THRESH))
+			if (cost+avg_fluct <= 0 ||
+				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+				if (migration_debug)
+					printk("-> found max.\n");
+				break;
+			}
+		/*
+		 * Increase the cachesize in 5% steps:
+		 */
+		size = size * 20 / 19;
+	}
+
+	if (migration_debug)
+		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+			cpu1, cpu2, size_found, max_cost);
+
+	vfree(cache);
+
+	/*
+	 * A task is considered 'cache cold' if at least 2 times
+	 * the worst-case cost of migration has passed.
+	 *
+	 * (this limit is only listened to if the load-balancing
+	 * situation is 'nice' - if there is a large imbalance we
+	 * ignore it for the sake of CPU utilization and
+	 * processing fairness.)
+	 */
+	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+	unsigned long j0, j1, distance, max_distance = 0;
+	struct sched_domain *sd;
+
+	j0 = jiffies;
+
+	/*
+	 * First pass - calculate the cacheflush times:
+	 */
+	for_each_cpu_mask(cpu1, *cpu_map) {
+		for_each_cpu_mask(cpu2, *cpu_map) {
+			if (cpu1 == cpu2)
+				continue;
+			distance = domain_distance(cpu1, cpu2);
+			max_distance = max(max_distance, distance);
+			/*
+			 * No result cached yet?
+			 */
+			if (migration_cost[distance] == -1LL)
+				migration_cost[distance] =
+					measure_migration_cost(cpu1, cpu2);
+		}
+	}
+	/*
+	 * Second pass - update the sched domain hierarchy with
+	 * the new cache-hot-time estimations:
+	 */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		distance = 0;
+		for_each_domain(cpu, sd) {
+			sd->cache_hot_time = migration_cost[distance];
+			distance++;
+		}
+	}
+	/*
+	 * Print the matrix:
+	 */
+	if (migration_debug)
+		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+			max_cache_size,
+#ifdef CONFIG_X86
+			cpu_khz/1000
+#else
+			-1
+#endif
+		);
+	printk("migration_cost=");
+	for (distance = 0; distance <= max_distance; distance++) {
+		if (distance)
+			printk(",");
+		printk("%ld", (long)migration_cost[distance] / 1000);
+	}
+	printk("\n");
+	j1 = jiffies;
+	if (migration_debug)
+		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+	/*
+	 * Move back to the original CPU. NUMA-Q gets confused
+	 * if we migrate to another quad during bootup.
+	 */
+	if (raw_smp_processor_id() != orig_cpu) {
+		cpumask_t mask = cpumask_of_cpu(orig_cpu),
+			saved_mask = current->cpus_allowed;
+
+		set_cpus_allowed(current, mask);
+		set_cpus_allowed(current, saved_mask);
+	}
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
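To make the measurement arithmetic concrete, a worked example with invented numbers (the code works in nanoseconds; the values here stand for the per-iteration averages produced by measure_cost()): suppose a given buffer size yields cost1 = 1.9 ms and cost2 = 2.1 ms for the two migrating scenarios, and cost3 = cost4 = 0.6 ms for the two local ones. Then:

	migration cost = (cost1 + cost2) - (cost3 + cost4)
	               = (1.9 + 2.1) - (0.6 + 0.6) = 2.8 ms

measure_migration_cost() keeps the maximum of this value across buffer sizes, and the returned cache-hot cutoff is

	cache_hot_time = 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE

i.e. twice the worst-case cost by default, scaled by the migration_factor= boot parameter if one was given.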
@@ -5439,6 +5942,10 @@ next_sg:
 #endif
 		cpu_attach_domain(sd, i);
 	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
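A usage note on the calibration wired up here: the matrix printed by calibrate_migration_costs() uses exactly the syntax the migration_cost= boot option accepts, so a measured line such as (illustrative output only)

	migration_cost=4123,8000

can be fed back on the kernel command line; pre-seeded entries are left alone by the '== -1LL' check in the first pass, which skips the lengthy boot-time measurement on subsequent boots.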
@@ -5505,7 +6012,7 @@ next_sg:
 * Detach sched domains from a group of cpus specified in cpu_map
 * These cpus will now be attached to the NULL domain
 */
-static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const cpumask_t *cpu_map)
 {
 	int i;
 