Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    561
1 file changed, 534 insertions(+), 27 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..3ee2ae45125f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,12 +27,14 @@
 #include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
                                < (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+        __put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
  * long it was waiting to run. We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
-static inline void sched_info_arrive(task_t *t)
+static void sched_info_arrive(task_t *t)
 {
         unsigned long now = jiffies, diff = 0;
         struct runqueue *rq = task_rq(t);
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
                 unsigned long long __sleep_time = now - p->timestamp;
                 unsigned long sleep_time;
 
-                if (__sleep_time > NS_MAX_SLEEP_AVG)
-                        sleep_time = NS_MAX_SLEEP_AVG;
-                else
-                        sleep_time = (unsigned long)__sleep_time;
+                if (unlikely(p->policy == SCHED_BATCH))
+                        sleep_time = 0;
+                else {
+                        if (__sleep_time > NS_MAX_SLEEP_AVG)
+                                sleep_time = NS_MAX_SLEEP_AVG;
+                        else
+                                sleep_time = (unsigned long)__sleep_time;
+                }
 
                 if (likely(sleep_time > 0)) {
                         /*
@@ -994,7 +1007,7 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long running = rq->nr_running;
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
                 }
         }
 
+        if (p->last_waker_cpu != this_cpu)
+                goto out_set_cpu;
+
         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                 goto out_set_cpu;
 
@@ -1351,6 +1367,8 @@ out_set_cpu:
                 cpu = task_cpu(p);
         }
 
+        p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
         if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1432,9 +1450,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
         memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+        p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
         p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
@@ -1849,7 +1870,7 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline
+static
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
-static inline
+static
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
                      struct sched_domain *sd, enum idle_type idle,
                      int *all_pinned)
@@ -2357,7 +2378,7 @@ out_balanced:
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *sd;
 
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
                 resched_task(rq->idle);
 }
 
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
         return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
          * it wont have any effect on scheduling until the task is
-         * not SCHED_NORMAL:
+         * not SCHED_NORMAL/SCHED_BATCH:
          */
         if (rt_task(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         BUG_ON(p->array);
         p->policy = policy;
         p->rt_priority = prio;
-        if (policy != SCHED_NORMAL)
+        if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
                 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-        else
+        } else {
                 p->prio = p->static_prio;
+                /*
+                 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+                 */
+                if (policy == SCHED_BATCH)
+                        p->sleep_avg = 0;
+        }
 }
 
 /**
@@ -3716,29 +3743,35 @@ recheck:
         if (policy < 0)
                 policy = oldpolicy = p->policy;
         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                        policy != SCHED_NORMAL)
+                        policy != SCHED_NORMAL && policy != SCHED_BATCH)
                 return -EINVAL;
         /*
          * Valid priorities for SCHED_FIFO and SCHED_RR are
-         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+         * SCHED_BATCH is 0.
          */
         if (param->sched_priority < 0 ||
             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
-        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+        if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+                                != (param->sched_priority == 0))
                 return -EINVAL;
 
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
         if (!capable(CAP_SYS_NICE)) {
-                /* can't change policy */
-                if (policy != p->policy &&
-                        !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                /*
+                 * can't change policy, except between SCHED_NORMAL
+                 * and SCHED_BATCH:
+                 */
+                if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+                        (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+                                !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
                         return -EPERM;
                 /* can't increase priority */
-                if (policy != SCHED_NORMAL &&
+                if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
                         param->sched_priority > p->rt_priority &&
                         param->sched_priority >
                                 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
                                        struct sched_param __user *param)
 {
+        /* negative values for policy are not valid */
+        if (policy < 0)
+                return -EINVAL;
+
         return do_sched_setscheduler(pid, policy, param);
 }
 
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
  * method, such as ACPI for e.g.
  */
 
-cpumask_t cpu_present_map;
+cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
-cpumask_t cpu_online_map = CPU_MASK_ALL;
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
                 ret = MAX_USER_RT_PRIO-1;
                 break;
         case SCHED_NORMAL:
+        case SCHED_BATCH:
                 ret = 0;
                 break;
         }
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
                 ret = 1;
                 break;
         case SCHED_NORMAL:
+        case SCHED_BATCH:
                 ret = 0;
         }
         return ret;
@@ -4379,6 +4418,7 @@ void show_state(void)
         } while_each_thread(g, p);
 
         read_unlock(&tasklist_lock);
+        mutex_debug_show_all_locks();
 }
 
 /**
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE            2
+#define MIN_CACHE_SIZE          (64*1024U)
+#define DEFAULT_CACHE_SIZE      (5*1024*1024U)
+#define ITERATIONS              2
+#define SIZE_THRESH             130
+#define COST_THRESH             130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+                { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+        int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+        str = get_options(str, ARRAY_SIZE(ints), ints);
+
+        printk("#ints: %d\n", ints[0]);
+        for (i = 1; i <= ints[0]; i++) {
+                migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+                printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+        }
+        return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+        get_option(&str, &migration_factor);
+        migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+        return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+        unsigned long distance = 0;
+        struct sched_domain *sd;
+
+        for_each_domain(cpu1, sd) {
+                WARN_ON(!cpu_isset(cpu1, sd->span));
+                if (cpu_isset(cpu2, sd->span))
+                        return distance;
+                distance++;
+        }
+        if (distance >= MAX_DOMAIN_DISTANCE) {
+                WARN_ON(1);
+                distance = MAX_DOMAIN_DISTANCE-1;
+        }
+
+        return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+        get_option(&str, &migration_debug);
+        return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+        get_option(&str, &max_cache_size);
+        return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+        unsigned long size = __size/sizeof(long), chunk1 = size/3,
+                        chunk2 = 2*size/3;
+        unsigned long *cache = __cache;
+        int i;
+
+        for (i = 0; i < size/6; i += 8) {
+                switch (i % 6) {
+                        case 0: cache[i]++;
+                        case 1: cache[size-1-i]++;
+                        case 2: cache[chunk1-i]++;
+                        case 3: cache[chunk1+i]++;
+                        case 4: cache[chunk2-i]++;
+                        case 5: cache[chunk2+i]++;
+                }
+        }
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+                                      int source, int target)
+{
+        cpumask_t mask, saved_mask;
+        unsigned long long t0, t1, t2, t3, cost;
+
+        saved_mask = current->cpus_allowed;
+
+        /*
+         * Flush source caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        /*
+         * Migrate to the source CPU:
+         */
+        mask = cpumask_of_cpu(source);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != source);
+
+        /*
+         * Dirty the working set:
+         */
+        t0 = sched_clock();
+        touch_cache(cache, size);
+        t1 = sched_clock();
+
+        /*
+         * Migrate to the target CPU, dirty the L2 cache and access
+         * the shared buffer. (which represents the working set
+         * of a migrated task.)
+         */
+        mask = cpumask_of_cpu(target);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != target);
+
+        t2 = sched_clock();
+        touch_cache(cache, size);
+        t3 = sched_clock();
+
+        cost = t1-t0 + t3-t2;
+
+        if (migration_debug >= 2)
+                printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+                        source, target, t1-t0, t1-t0, t3-t2, cost);
+        /*
+         * Flush target caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        set_cpus_allowed(current, saved_mask);
+
+        return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 20MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+        unsigned long long cost1, cost2;
+        int i;
+
+        /*
+         * Measure the migration cost of 'size' bytes, over an
+         * average of 10 runs:
+         *
+         * (We perturb the cache size by a small (0..4k)
+         *  value to compensate size/alignment related artifacts.
+         *  We also subtract the cost of the operation done on
+         *  the same CPU.)
+         */
+        cost1 = 0;
+
+        /*
+         * dry run, to make sure we start off cache-cold on cpu1,
+         * and to get any vmalloc pagefaults in advance:
+         */
+        measure_one(cache, size, cpu1, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+        measure_one(cache, size, cpu2, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+        /*
+         * (We measure the non-migrating [cached] cost on both
+         *  cpu1 and cpu2, to handle CPUs with different speeds)
+         */
+        cost2 = 0;
+
+        measure_one(cache, size, cpu1, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+        measure_one(cache, size, cpu2, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+        /*
+         * Get the per-iteration migration cost:
+         */
+        do_div(cost1, 2*ITERATIONS);
+        do_div(cost2, 2*ITERATIONS);
+
+        return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+        unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+        unsigned int max_size, size, size_found = 0;
+        long long cost = 0, prev_cost;
+        void *cache;
+
+        /*
+         * Search from max_cache_size*5 down to 64K - the real relevant
+         * cachesize has to lie somewhere inbetween.
+         */
+        if (max_cache_size) {
+                max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+                size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+        } else {
+                /*
+                 * Since we have no estimation about the relevant
+                 * search range
+                 */
+                max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+                size = MIN_CACHE_SIZE;
+        }
+
+        if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+                printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+                return 0;
+        }
+
+        /*
+         * Allocate the working set:
+         */
+        cache = vmalloc(max_size);
+        if (!cache) {
+                printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+                return 1000000; // return 1 msec on very small boxen
+        }
+
+        while (size <= max_size) {
+                prev_cost = cost;
+                cost = measure_cost(cpu1, cpu2, cache, size);
+
+                /*
+                 * Update the max:
+                 */
+                if (cost > 0) {
+                        if (max_cost < cost) {
+                                max_cost = cost;
+                                size_found = size;
+                        }
+                }
+                /*
+                 * Calculate average fluctuation, we use this to prevent
+                 * noise from triggering an early break out of the loop:
+                 */
+                fluct = abs(cost - prev_cost);
+                avg_fluct = (avg_fluct + fluct)/2;
+
+                if (migration_debug)
+                        printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                                cpu1, cpu2, size,
+                                (long)cost / 1000000,
+                                ((long)cost / 100000) % 10,
+                                (long)max_cost / 1000000,
+                                ((long)max_cost / 100000) % 10,
+                                domain_distance(cpu1, cpu2),
+                                cost, avg_fluct);
+
+                /*
+                 * If we iterated at least 20% past the previous maximum,
+                 * and the cost has dropped by more than 20% already,
+                 * (taking fluctuations into account) then we assume to
+                 * have found the maximum and break out of the loop early:
+                 */
+                if (size_found && (size*100 > size_found*SIZE_THRESH))
+                        if (cost+avg_fluct <= 0 ||
+                                max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+                                if (migration_debug)
+                                        printk("-> found max.\n");
+                                break;
+                        }
+                /*
+                 * Increase the cachesize in 5% steps:
+                 */
+                size = size * 20 / 19;
+        }
+
+        if (migration_debug)
+                printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+                        cpu1, cpu2, size_found, max_cost);
+
+        vfree(cache);
+
+        /*
+         * A task is considered 'cache cold' if at least 2 times
+         * the worst-case cost of migration has passed.
+         *
+         * (this limit is only listened to if the load-balancing
+         * situation is 'nice' - if there is a large imbalance we
+         * ignore it for the sake of CPU utilization and
+         * processing fairness.)
+         */
+        return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+        int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+        unsigned long j0, j1, distance, max_distance = 0;
+        struct sched_domain *sd;
+
+        j0 = jiffies;
+
+        /*
+         * First pass - calculate the cacheflush times:
+         */
+        for_each_cpu_mask(cpu1, *cpu_map) {
+                for_each_cpu_mask(cpu2, *cpu_map) {
+                        if (cpu1 == cpu2)
+                                continue;
+                        distance = domain_distance(cpu1, cpu2);
+                        max_distance = max(max_distance, distance);
+                        /*
+                         * No result cached yet?
+                         */
+                        if (migration_cost[distance] == -1LL)
+                                migration_cost[distance] =
+                                        measure_migration_cost(cpu1, cpu2);
+                }
+        }
+        /*
+         * Second pass - update the sched domain hierarchy with
+         * the new cache-hot-time estimations:
+         */
+        for_each_cpu_mask(cpu, *cpu_map) {
+                distance = 0;
+                for_each_domain(cpu, sd) {
+                        sd->cache_hot_time = migration_cost[distance];
+                        distance++;
+                }
+        }
+        /*
+         * Print the matrix:
+         */
+        if (migration_debug)
+                printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+                        max_cache_size,
+#ifdef CONFIG_X86
+                        cpu_khz/1000
+#else
+                        -1
+#endif
+                );
+        printk("migration_cost=");
+        for (distance = 0; distance <= max_distance; distance++) {
+                if (distance)
+                        printk(",");
+                printk("%ld", (long)migration_cost[distance] / 1000);
+        }
+        printk("\n");
+        j1 = jiffies;
+        if (migration_debug)
+                printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+        /*
+         * Move back to the original CPU. NUMA-Q gets confused
+         * if we migrate to another quad during bootup.
+         */
+        if (raw_smp_processor_id() != orig_cpu) {
+                cpumask_t mask = cpumask_of_cpu(orig_cpu),
+                        saved_mask = current->cpus_allowed;
+
+                set_cpus_allowed(current, mask);
+                set_cpus_allowed(current, saved_mask);
+        }
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5439,6 +5942,10 @@ next_sg:
 #endif
                 cpu_attach_domain(sd, i);
         }
+        /*
+         * Tune cache-hot values:
+         */
+        calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -5505,7 +6012,7 @@ next_sg:
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const cpumask_t *cpu_map)
 {
         int i;
 
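
Usage note (not part of the patch): the user-visible effect of the SCHED_BATCH changes above is that sched_setscheduler() now accepts a fourth policy whose only valid sched_priority is 0, and which unprivileged tasks may switch to and from SCHED_NORMAL. Below is a minimal, illustrative userspace sketch of selecting the new policy; the SCHED_BATCH fallback definition is an assumption for illustration (the value 3 matches the kernel headers of this era), since C libraries at the time may not yet have exported the constant.

/*
 * Illustrative sketch only -- not part of the patch above.
 * Put the calling task into the new SCHED_BATCH class.
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3   /* assumed value, taken from kernel headers of this era */
#endif

int main(void)
{
        struct sched_param sp;

        memset(&sp, 0, sizeof(sp));
        sp.sched_priority = 0;  /* SCHED_BATCH only accepts priority 0 */

        /* pid 0 means "the calling task" */
        if (sched_setscheduler(0, SCHED_BATCH, &sp) == -1) {
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
                return 1;
        }
        printf("policy is now %d\n", sched_getscheduler(0));
        return 0;
}

Requesting a nonzero priority together with SCHED_BATCH, or passing a negative policy value, is rejected with -EINVAL by the validation added in sched_setscheduler()/sys_sched_setscheduler() in this patch.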