Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--   kernel/sched.c | 561
 1 file changed, 534 insertions(+), 27 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..3ee2ae45125f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,12 +27,14 @@
 #include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)      \
                         < (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+        __put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
  * long it was waiting to run. We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
-static inline void sched_info_arrive(task_t *t)
+static void sched_info_arrive(task_t *t)
 {
         unsigned long now = jiffies, diff = 0;
         struct runqueue *rq = task_rq(t);
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
         unsigned long long __sleep_time = now - p->timestamp;
         unsigned long sleep_time;
 
-        if (__sleep_time > NS_MAX_SLEEP_AVG)
-                sleep_time = NS_MAX_SLEEP_AVG;
-        else
-                sleep_time = (unsigned long)__sleep_time;
+        if (unlikely(p->policy == SCHED_BATCH))
+                sleep_time = 0;
+        else {
+                if (__sleep_time > NS_MAX_SLEEP_AVG)
+                        sleep_time = NS_MAX_SLEEP_AVG;
+                else
+                        sleep_time = (unsigned long)__sleep_time;
+        }
 
         if (likely(sleep_time > 0)) {
                 /*
@@ -994,7 +1007,7 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long running = rq->nr_running;
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
                 }
         }
 
+        if (p->last_waker_cpu != this_cpu)
+                goto out_set_cpu;
+
         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                 goto out_set_cpu;
 
@@ -1351,6 +1367,8 @@ out_set_cpu:
                 cpu = task_cpu(p);
         }
 
+        p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
         if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1432,9 +1450,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
         memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+        p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
         p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
@@ -1849,7 +1870,7 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline
+static
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
-static inline
+static
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
                      struct sched_domain *sd, enum idle_type idle,
                      int *all_pinned)
@@ -2357,7 +2378,7 @@ out_balanced:
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *sd;
 
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
                 resched_task(rq->idle);
 }
 
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
         return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
          * it wont have any effect on scheduling until the task is
-         * not SCHED_NORMAL:
+         * not SCHED_NORMAL/SCHED_BATCH:
          */
         if (rt_task(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         BUG_ON(p->array);
         p->policy = policy;
         p->rt_priority = prio;
-        if (policy != SCHED_NORMAL)
+        if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
                 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-        else
+        } else {
                 p->prio = p->static_prio;
+                /*
+                 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+                 */
+                if (policy == SCHED_BATCH)
+                        p->sleep_avg = 0;
+        }
 }
 
 /**
@@ -3716,29 +3743,35 @@ recheck:
         if (policy < 0)
                 policy = oldpolicy = p->policy;
         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                        policy != SCHED_NORMAL)
+                        policy != SCHED_NORMAL && policy != SCHED_BATCH)
                 return -EINVAL;
         /*
          * Valid priorities for SCHED_FIFO and SCHED_RR are
-         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+         * SCHED_BATCH is 0.
          */
         if (param->sched_priority < 0 ||
             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
-        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+        if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+                                        != (param->sched_priority == 0))
                 return -EINVAL;
 
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
         if (!capable(CAP_SYS_NICE)) {
-                /* can't change policy */
-                if (policy != p->policy &&
-                        !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                /*
+                 * can't change policy, except between SCHED_NORMAL
+                 * and SCHED_BATCH:
+                 */
+                if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+                        (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+                                !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
                         return -EPERM;
                 /* can't increase priority */
-                if (policy != SCHED_NORMAL &&
+                if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
                         param->sched_priority > p->rt_priority &&
                         param->sched_priority >
                                 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
                                        struct sched_param __user *param)
 {
+        /* negative values for policy are not valid */
+        if (policy < 0)
+                return -EINVAL;
+
         return do_sched_setscheduler(pid, policy, param);
 }
 
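The SCHED_BATCH policy introduced in the hunks above is requested through the existing sched_setscheduler() interface, with sched_priority forced to 0 exactly as for SCHED_NORMAL, and the CAP_SYS_NICE check above deliberately lets an unprivileged task move between SCHED_NORMAL and SCHED_BATCH. A minimal userspace sketch of that call, not part of the patch; the fallback SCHED_BATCH value of 3 matches this kernel series but is an assumption if your libc headers do not define it:

/* batch_self.c - illustrative only: put the calling process into SCHED_BATCH */
#include <sched.h>
#include <stdio.h>
#include <string.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3   /* policy value used by this kernel series */
#endif

int main(void)
{
        struct sched_param param;

        memset(&param, 0, sizeof(param));
        param.sched_priority = 0;       /* anything else is rejected with -EINVAL above */

        if (sched_setscheduler(0, SCHED_BATCH, &param) == -1) {
                perror("sched_setscheduler(SCHED_BATCH)");
                return 1;
        }
        printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
        return 0;
}

Switching back to SCHED_NORMAL works the same way and, per the permission check above, also needs no extra privilege.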
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
  * method, such as ACPI for e.g.
  */
 
-cpumask_t cpu_present_map;
+cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
-cpumask_t cpu_online_map = CPU_MASK_ALL;
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
                 ret = MAX_USER_RT_PRIO-1;
                 break;
         case SCHED_NORMAL:
+        case SCHED_BATCH:
                 ret = 0;
                 break;
         }
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
                 ret = 1;
                 break;
         case SCHED_NORMAL:
+        case SCHED_BATCH:
                 ret = 0;
         }
         return ret;
@@ -4379,6 +4418,7 @@ void show_state(void)
         } while_each_thread(g, p);
 
         read_unlock(&tasklist_lock);
+        mutex_debug_show_all_locks();
 }
 
 /**
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE            2
+#define MIN_CACHE_SIZE          (64*1024U)
+#define DEFAULT_CACHE_SIZE      (5*1024*1024U)
+#define ITERATIONS              2
+#define SIZE_THRESH             130
+#define COST_THRESH             130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+                { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+        int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+        str = get_options(str, ARRAY_SIZE(ints), ints);
+
+        printk("#ints: %d\n", ints[0]);
+        for (i = 1; i <= ints[0]; i++) {
+                migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+                printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+        }
+        return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+        get_option(&str, &migration_factor);
+        migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+        return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+        unsigned long distance = 0;
+        struct sched_domain *sd;
+
+        for_each_domain(cpu1, sd) {
+                WARN_ON(!cpu_isset(cpu1, sd->span));
+                if (cpu_isset(cpu2, sd->span))
+                        return distance;
+                distance++;
+        }
+        if (distance >= MAX_DOMAIN_DISTANCE) {
+                WARN_ON(1);
+                distance = MAX_DOMAIN_DISTANCE-1;
+        }
+
+        return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+        get_option(&str, &migration_debug);
+        return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+        get_option(&str, &max_cache_size);
+        return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+        unsigned long size = __size/sizeof(long), chunk1 = size/3,
+                        chunk2 = 2*size/3;
+        unsigned long *cache = __cache;
+        int i;
+
+        for (i = 0; i < size/6; i += 8) {
+                switch (i % 6) {
+                        case 0: cache[i]++;
+                        case 1: cache[size-1-i]++;
+                        case 2: cache[chunk1-i]++;
+                        case 3: cache[chunk1+i]++;
+                        case 4: cache[chunk2-i]++;
+                        case 5: cache[chunk2+i]++;
+                }
+        }
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+                                      int source, int target)
+{
+        cpumask_t mask, saved_mask;
+        unsigned long long t0, t1, t2, t3, cost;
+
+        saved_mask = current->cpus_allowed;
+
+        /*
+         * Flush source caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        /*
+         * Migrate to the source CPU:
+         */
+        mask = cpumask_of_cpu(source);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != source);
+
+        /*
+         * Dirty the working set:
+         */
+        t0 = sched_clock();
+        touch_cache(cache, size);
+        t1 = sched_clock();
+
+        /*
+         * Migrate to the target CPU, dirty the L2 cache and access
+         * the shared buffer. (which represents the working set
+         * of a migrated task.)
+         */
+        mask = cpumask_of_cpu(target);
+        set_cpus_allowed(current, mask);
+        WARN_ON(smp_processor_id() != target);
+
+        t2 = sched_clock();
+        touch_cache(cache, size);
+        t3 = sched_clock();
+
+        cost = t1-t0 + t3-t2;
+
+        if (migration_debug >= 2)
+                printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+                        source, target, t1-t0, t1-t0, t3-t2, cost);
+        /*
+         * Flush target caches to RAM and invalidate them:
+         */
+        sched_cacheflush();
+
+        set_cpus_allowed(current, saved_mask);
+
+        return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 20MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+        unsigned long long cost1, cost2;
+        int i;
+
+        /*
+         * Measure the migration cost of 'size' bytes, over an
+         * average of 10 runs:
+         *
+         * (We perturb the cache size by a small (0..4k)
+         *  value to compensate size/alignment related artifacts.
+         *  We also subtract the cost of the operation done on
+         *  the same CPU.)
+         */
+        cost1 = 0;
+
+        /*
+         * dry run, to make sure we start off cache-cold on cpu1,
+         * and to get any vmalloc pagefaults in advance:
+         */
+        measure_one(cache, size, cpu1, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+        measure_one(cache, size, cpu2, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+        /*
+         * (We measure the non-migrating [cached] cost on both
+         *  cpu1 and cpu2, to handle CPUs with different speeds)
+         */
+        cost2 = 0;
+
+        measure_one(cache, size, cpu1, cpu1);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+        measure_one(cache, size, cpu2, cpu2);
+        for (i = 0; i < ITERATIONS; i++)
+                cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+        /*
+         * Get the per-iteration migration cost:
+         */
+        do_div(cost1, 2*ITERATIONS);
+        do_div(cost2, 2*ITERATIONS);
+
+        return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+        unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+        unsigned int max_size, size, size_found = 0;
+        long long cost = 0, prev_cost;
+        void *cache;
+
+        /*
+         * Search from max_cache_size*5 down to 64K - the real relevant
+         * cachesize has to lie somewhere inbetween.
+         */
+        if (max_cache_size) {
+                max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+                size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+        } else {
+                /*
+                 * Since we have no estimation about the relevant
+                 * search range
+                 */
+                max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+                size = MIN_CACHE_SIZE;
+        }
+
+        if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+                printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+                return 0;
+        }
+
+        /*
+         * Allocate the working set:
+         */
+        cache = vmalloc(max_size);
+        if (!cache) {
+                printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+                return 1000000; // return 1 msec on very small boxen
+        }
+
+        while (size <= max_size) {
+                prev_cost = cost;
+                cost = measure_cost(cpu1, cpu2, cache, size);
+
+                /*
+                 * Update the max:
+                 */
+                if (cost > 0) {
+                        if (max_cost < cost) {
+                                max_cost = cost;
+                                size_found = size;
+                        }
+                }
+                /*
+                 * Calculate average fluctuation, we use this to prevent
+                 * noise from triggering an early break out of the loop:
+                 */
+                fluct = abs(cost - prev_cost);
+                avg_fluct = (avg_fluct + fluct)/2;
+
+                if (migration_debug)
+                        printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                                cpu1, cpu2, size,
+                                (long)cost / 1000000,
+                                ((long)cost / 100000) % 10,
+                                (long)max_cost / 1000000,
+                                ((long)max_cost / 100000) % 10,
+                                domain_distance(cpu1, cpu2),
+                                cost, avg_fluct);
+
+                /*
+                 * If we iterated at least 20% past the previous maximum,
+                 * and the cost has dropped by more than 20% already,
+                 * (taking fluctuations into account) then we assume to
+                 * have found the maximum and break out of the loop early:
+                 */
+                if (size_found && (size*100 > size_found*SIZE_THRESH))
+                        if (cost+avg_fluct <= 0 ||
+                                max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+                                if (migration_debug)
+                                        printk("-> found max.\n");
+                                break;
+                        }
+                /*
+                 * Increase the cachesize in 5% steps:
+                 */
+                size = size * 20 / 19;
+        }
+
+        if (migration_debug)
+                printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+                        cpu1, cpu2, size_found, max_cost);
+
+        vfree(cache);
+
+        /*
+         * A task is considered 'cache cold' if at least 2 times
+         * the worst-case cost of migration has passed.
+         *
+         * (this limit is only listened to if the load-balancing
+         * situation is 'nice' - if there is a large imbalance we
+         * ignore it for the sake of CPU utilization and
+         * processing fairness.)
+         */
+        return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+        int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+        unsigned long j0, j1, distance, max_distance = 0;
+        struct sched_domain *sd;
+
+        j0 = jiffies;
+
+        /*
+         * First pass - calculate the cacheflush times:
+         */
+        for_each_cpu_mask(cpu1, *cpu_map) {
+                for_each_cpu_mask(cpu2, *cpu_map) {
+                        if (cpu1 == cpu2)
+                                continue;
+                        distance = domain_distance(cpu1, cpu2);
+                        max_distance = max(max_distance, distance);
+                        /*
+                         * No result cached yet?
+                         */
+                        if (migration_cost[distance] == -1LL)
+                                migration_cost[distance] =
+                                        measure_migration_cost(cpu1, cpu2);
+                }
+        }
+        /*
+         * Second pass - update the sched domain hierarchy with
+         * the new cache-hot-time estimations:
+         */
+        for_each_cpu_mask(cpu, *cpu_map) {
+                distance = 0;
+                for_each_domain(cpu, sd) {
+                        sd->cache_hot_time = migration_cost[distance];
+                        distance++;
+                }
+        }
+        /*
+         * Print the matrix:
+         */
+        if (migration_debug)
+                printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+                        max_cache_size,
+#ifdef CONFIG_X86
+                        cpu_khz/1000
+#else
+                        -1
+#endif
+                );
+        printk("migration_cost=");
+        for (distance = 0; distance <= max_distance; distance++) {
+                if (distance)
+                        printk(",");
+                printk("%ld", (long)migration_cost[distance] / 1000);
+        }
+        printk("\n");
+        j1 = jiffies;
+        if (migration_debug)
+                printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+        /*
+         * Move back to the original CPU. NUMA-Q gets confused
+         * if we migrate to another quad during bootup.
+         */
+        if (raw_smp_processor_id() != orig_cpu) {
+                cpumask_t mask = cpumask_of_cpu(orig_cpu),
+                        saved_mask = current->cpus_allowed;
+
+                set_cpus_allowed(current, mask);
+                set_cpus_allowed(current, saved_mask);
+        }
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
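The value computed at the end of measure_migration_cost() is what the second pass of calibrate_migration_costs() writes into sd->cache_hot_time, and that is exactly what the task_hot() test near the top of this diff compares against. A condensed sketch of that relationship (a simplified restatement of the code above, not a literal copy of it):

/* Sketch only: how the calibrated cost feeds the cache-hot test. */

/* measure_migration_cost(): twice the worst-case measured cost, scaled by
 * the migration_factor= boot parameter (100% is represented as 128). */
unsigned long long cache_hot_cutoff(unsigned long long max_cost,
                                    unsigned int migration_factor)
{
        return 2 * max_cost * migration_factor / 128;
}

/* task_hot(): a task still counts as cache-hot if it last ran more
 * recently than the calibrated cutoff stored in sd->cache_hot_time. */
int task_is_cache_hot(unsigned long long now,
                      unsigned long long last_ran,
                      unsigned long long cache_hot_time)
{
        return (long long)(now - last_ran) < (long long)cache_hot_time;
}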
@@ -5439,6 +5942,10 @@ next_sg:
 #endif
                 cpu_attach_domain(sd, i);
         }
+        /*
+         * Tune cache-hot values:
+         */
+        calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
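For tuning, the calibration code added above also registers four boot parameters via __setup(): migration_cost= (a comma-separated per-domain-level override in microseconds, e.g. migration_cost=1000,2000,3000), migration_factor= (a percentage multiplier for the cache-hot cutoff, e.g. 150 for 1.5 times longer cutoffs), max_cache_size= (the upper limit of the search range, which architectures with large caches are expected to prime), and migration_debug= (verbosity; a value of 2 or more also logs each individual measurement). A combined command line such as migration_cost=1000,2000 migration_debug=1 is the expected way to override or inspect the calibration.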
@@ -5505,7 +6012,7 @@ next_sg:
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const cpumask_t *cpu_map)
 {
         int i;
 