sched: zap the migration init / cache-hot balancing code

the SMP load-balancer uses the boot-time migration-cost estimation code to attempt to improve the quality of balancing. The reason for this code is that the discrete priority queues do not preserve the order of scheduling accurately, so the load-balancer skips tasks that were running on a CPU 'recently'. this code is fundamental fragile: the boot-time migration cost detector doesnt really work on systems that had large L3 caches, it caused boot delays on large systems and the whole cache-hot concept made the balancing code pretty undeterministic as well. (and hey, i wrote most of it, so i can say it out loud that it sucks ;-) under CFS the same purpose of cache affinity can be achieved without any special cache-hot special-case: tasks are sorted in the 'timeline' tree and the SMP balancer picks tasks from the left side of the tree, thus the most cache-cold task is balanced automatically. Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2007-07-09 12:51:57 -0400
committer: Ingo Molnar <mingo@elte.hu> 2007-07-09 12:51:57 -0400
commit: 0437e109e1841607f2988891eaa36c531c6aa6ac (patch)
tree: e9d8f170786f7e33d4c5829cb008cf38d42a2014 /kernel
parent: 0e6aca43e08a62a48d6770e9a159dbec167bf4c6 (diff)
1 files changed, 0 insertions, 481 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 #define SD_NODES_PER_DOMAIN 16
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- * 1) the source CPU reads+dirties a shared buffer
- * 2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- *  - source: CPU1, target: CPU2 | cost1
- *  - source: CPU2, target: CPU1 | cost2
- *  - source: CPU1, target: CPU1 | cost3
- *  - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost3+cost4-cost1-cost2 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in 5% steps - measuring each buffer-size separately, and
- * doing a maximum search for the cost. (The maximum cost for a migration
- * normally occurs when the working set size is around the effective cache
- * size.)
- */
-#define SEARCH_SCOPE            2
-#define MIN_CACHE_SIZE          (64*1024U)
-#define DEFAULT_CACHE_SIZE      (5*1024*1024U)
-#define ITERATIONS              1
-#define SIZE_THRESH             130
-#define COST_THRESH             130
-/*
- * The migration cost is a function of 'domain distance'. Domain
- * distance is the number of steps a CPU has to iterate down its
- * domain tree to share a domain with the other CPU. The farther
- * two CPUs are from each other, the larger the distance gets.
- *
- * Note that we use the distance only to cache measurement results,
- * the distance value is not used numerically otherwise. When two
- * CPUs have the same distance it is assumed that the migration
- * cost is the same. (this is a simplification but quite practical)
- */
-#define MAX_DOMAIN_DISTANCE 32
-static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-                { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
-/*
- * Architectures may override the migration cost and thus avoid
- * boot-time calibration. Unit is nanoseconds. Mostly useful for
- * virtualized hardware:
- */
-#ifdef CONFIG_DEFAULT_MIGRATION_COST
-                        CONFIG_DEFAULT_MIGRATION_COST
-#else
-                        -1LL
-#endif
-};
-/*
- * Allow override of migration cost - in units of microseconds.
- * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
- * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
- */
-static int __init migration_cost_setup(char *str)
-{
-        int ints[MAX_DOMAIN_DISTANCE+1], i;
-        str = get_options(str, ARRAY_SIZE(ints), ints);
-        printk("#ints: %d\n", ints[0]);
-        for (i = 1; i <= ints[0]; i++) {
-                migration_cost[i-1] = (unsigned long long)ints[i]*1000;
-                printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
-        }
-        return 1;
-}
-__setup ("migration_cost=", migration_cost_setup);
-/*
- * Global multiplier (divisor) for migration-cutoff values,
- * in percentiles. E.g. use a value of 150 to get 1.5 times
- * longer cache-hot cutoff times.
- *
- * (We scale it from 100 to 128 to long long handling easier.)
- */
-#define MIGRATION_FACTOR_SCALE 128
-static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
-static int __init setup_migration_factor(char *str)
-{
-        get_option(&str, &migration_factor);
-        migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
-        return 1;
-}
-__setup("migration_factor=", setup_migration_factor);
-/*
- * Estimated distance of two CPUs, measured via the number of domains
- * we have to pass for the two CPUs to be in the same span:
- */
-static unsigned long domain_distance(int cpu1, int cpu2)
-{
-        unsigned long distance = 0;
-        struct sched_domain *sd;
-        for_each_domain(cpu1, sd) {
-                WARN_ON(!cpu_isset(cpu1, sd->span));
-                if (cpu_isset(cpu2, sd->span))
-                        return distance;
-                distance++;
-        }
-        if (distance >= MAX_DOMAIN_DISTANCE) {
-                WARN_ON(1);
-                distance = MAX_DOMAIN_DISTANCE-1;
-        }
-        return distance;
-}
-static unsigned int migration_debug;
-static int __init setup_migration_debug(char *str)
-{
-        get_option(&str, &migration_debug);
-        return 1;
-}
-__setup("migration_debug=", setup_migration_debug);
-/*
- * Maximum cache-size that the scheduler should try to measure.
- * Architectures with larger caches should tune this up during
- * bootup. Gets used in the domain-setup code (i.e. during SMP
- * bootup).
- */
-unsigned int max_cache_size;
-static int __init setup_max_cache_size(char *str)
-{
-        get_option(&str, &max_cache_size);
-        return 1;
-}
-__setup("max_cache_size=", setup_max_cache_size);
-/*
- * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
- * is the operation that is timed, so we try to generate unpredictable
- * cachemisses that still end up filling the L2 cache:
- */
-static void touch_cache(void *__cache, unsigned long __size)
-{
-        unsigned long size = __size / sizeof(long);
-        unsigned long chunk1 = size / 3;
-        unsigned long chunk2 = 2 * size / 3;
-        unsigned long *cache = __cache;
-        int i;
-        for (i = 0; i < size/6; i += 8) {
-                switch (i % 6) {
-                        case 0: cache[i]++;
-                        case 1: cache[size-1-i]++;
-                        case 2: cache[chunk1-i]++;
-                        case 3: cache[chunk1+i]++;
-                        case 4: cache[chunk2-i]++;
-                        case 5: cache[chunk2+i]++;
-                }
-        }
-}
-/*
- * Measure the cache-cost of one task migration. Returns in units of nsec.
- */
-static unsigned long long
-measure_one(void *cache, unsigned long size, int source, int target)
-{
-        cpumask_t mask, saved_mask;
-        unsigned long long t0, t1, t2, t3, cost;
-        saved_mask = current->cpus_allowed;
-        /*
-         * Flush source caches to RAM and invalidate them:
-         */
-        sched_cacheflush();
-        /*
-         * Migrate to the source CPU:
-         */
-        mask = cpumask_of_cpu(source);
-        set_cpus_allowed(current, mask);
-        WARN_ON(smp_processor_id() != source);
-        /*
-         * Dirty the working set:
-         */
-        t0 = sched_clock();
-        touch_cache(cache, size);
-        t1 = sched_clock();
-        /*
-         * Migrate to the target CPU, dirty the L2 cache and access
-         * the shared buffer. (which represents the working set
-         * of a migrated task.)
-         */
-        mask = cpumask_of_cpu(target);
-        set_cpus_allowed(current, mask);
-        WARN_ON(smp_processor_id() != target);
-        t2 = sched_clock();
-        touch_cache(cache, size);
-        t3 = sched_clock();
-        cost = t1-t0 + t3-t2;
-        if (migration_debug >= 2)
-                printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
-                        source, target, t1-t0, t1-t0, t3-t2, cost);
-        /*
-         * Flush target caches to RAM and invalidate them:
-         */
-        sched_cacheflush();
-        set_cpus_allowed(current, saved_mask);
-        return cost;
-}
-/*
- * Measure a series of task migrations and return the average
- * result. Since this code runs early during bootup the system
- * is 'undisturbed' and the average latency makes sense.
- *
- * The algorithm in essence auto-detects the relevant cache-size,
- * so it will properly detect different cachesizes for different
- * cache-hierarchies, depending on how the CPUs are connected.
- *
- * Architectures can prime the upper limit of the search range via
- * max_cache_size, otherwise the search range defaults to 20MB...64K.
- */
-static unsigned long long
-measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
-{
-        unsigned long long cost1, cost2;
-        int i;
-        /*
-         * Measure the migration cost of 'size' bytes, over an
-         * average of 10 runs:
-         *
-         * (We perturb the cache size by a small (0..4k)
-         *  value to compensate size/alignment related artifacts.
-         *  We also subtract the cost of the operation done on
-         *  the same CPU.)
-         */
-        cost1 = 0;
-        /*
-         * dry run, to make sure we start off cache-cold on cpu1,
-         * and to get any vmalloc pagefaults in advance:
-         */
-        measure_one(cache, size, cpu1, cpu2);
-        for (i = 0; i < ITERATIONS; i++)
-                cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
-        measure_one(cache, size, cpu2, cpu1);
-        for (i = 0; i < ITERATIONS; i++)
-                cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
-        /*
-         * (We measure the non-migrating [cached] cost on both
-         *  cpu1 and cpu2, to handle CPUs with different speeds)
-         */
-        cost2 = 0;
-        measure_one(cache, size, cpu1, cpu1);
-        for (i = 0; i < ITERATIONS; i++)
-                cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
-        measure_one(cache, size, cpu2, cpu2);
-        for (i = 0; i < ITERATIONS; i++)
-                cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
-        /*
-         * Get the per-iteration migration cost:
-         */
-        do_div(cost1, 2 * ITERATIONS);
-        do_div(cost2, 2 * ITERATIONS);
-        return cost1 - cost2;
-}
-static unsigned long long measure_migration_cost(int cpu1, int cpu2)
-{
-        unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
-        unsigned int max_size, size, size_found = 0;
-        long long cost = 0, prev_cost;
-        void *cache;
-        /*
-         * Search from max_cache_size*5 down to 64K - the real relevant
-         * cachesize has to lie somewhere inbetween.
-         */
-        if (max_cache_size) {
-                max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
-                size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
-        } else {
-                /*
-                 * Since we have no estimation about the relevant
-                 * search range
-                 */
-                max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
-                size = MIN_CACHE_SIZE;
-        }
-        if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
-                printk("cpu %d and %d not both online!\n", cpu1, cpu2);
-                return 0;
-        }
-        /*
-         * Allocate the working set:
-         */
-        cache = vmalloc(max_size);
-        if (!cache) {
-                printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
-                return 1000000; /* return 1 msec on very small boxen */
-        }
-        while (size <= max_size) {
-                prev_cost = cost;
-                cost = measure_cost(cpu1, cpu2, cache, size);
-                /*
-                 * Update the max:
-                 */
-                if (cost > 0) {
-                        if (max_cost < cost) {
-                                max_cost = cost;
-                                size_found = size;
-                        }
-                }
-                /*
-                 * Calculate average fluctuation, we use this to prevent
-                 * noise from triggering an early break out of the loop:
-                 */
-                fluct = abs(cost - prev_cost);
-                avg_fluct = (avg_fluct + fluct)/2;
-                if (migration_debug)
-                        printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
-                                "(%8Ld %8Ld)\n",
-                                cpu1, cpu2, size,
-                                (long)cost / 1000000,
-                                ((long)cost / 100000) % 10,
-                                (long)max_cost / 1000000,
-                                ((long)max_cost / 100000) % 10,
-                                domain_distance(cpu1, cpu2),
-                                cost, avg_fluct);
-                /*
-                 * If we iterated at least 20% past the previous maximum,
-                 * and the cost has dropped by more than 20% already,
-                 * (taking fluctuations into account) then we assume to
-                 * have found the maximum and break out of the loop early:
-                 */
-                if (size_found && (size*100 > size_found*SIZE_THRESH))
-                        if (cost+avg_fluct <= 0 ||
-                                max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
-                                if (migration_debug)
-                                        printk("-> found max.\n");
-                                break;
-                        }
-                /*
-                 * Increase the cachesize in 10% steps:
-                 */
-                size = size * 10 / 9;
-        }
-        if (migration_debug)
-                printk("[%d][%d] working set size found: %d, cost: %Ld\n",
-                        cpu1, cpu2, size_found, max_cost);
-        vfree(cache);
-        /*
-         * A task is considered 'cache cold' if at least 2 times
-         * the worst-case cost of migration has passed.
-         *
-         * (this limit is only listened to if the load-balancing
-         * situation is 'nice' - if there is a large imbalance we
-         * ignore it for the sake of CPU utilization and
-         * processing fairness.)
-         */
-        return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
-}
-static void calibrate_migration_costs(const cpumask_t *cpu_map)
-{
-        int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
-        unsigned long j0, j1, distance, max_distance = 0;
-        struct sched_domain *sd;
-        j0 = jiffies;
-        /*
-         * First pass - calculate the cacheflush times:
-         */
-        for_each_cpu_mask(cpu1, *cpu_map) {
-                for_each_cpu_mask(cpu2, *cpu_map) {
-                        if (cpu1 == cpu2)
-                                continue;
-                        distance = domain_distance(cpu1, cpu2);
-                        max_distance = max(max_distance, distance);
-                        /*
-                         * No result cached yet?
-                         */
-                        if (migration_cost[distance] == -1LL)
-                                migration_cost[distance] =
-                                        measure_migration_cost(cpu1, cpu2);
-                }
-        }
-        /*
-         * Second pass - update the sched domain hierarchy with
-         * the new cache-hot-time estimations:
-         */
-        for_each_cpu_mask(cpu, *cpu_map) {
-                distance = 0;
-                for_each_domain(cpu, sd) {
-                        sd->cache_hot_time = migration_cost[distance];
-                        distance++;
-                }
-        }
-        /*
-         * Print the matrix:
-         */
-        if (migration_debug)
-                printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
-                        max_cache_size,
-#ifdef CONFIG_X86
-                        cpu_khz/1000
-#else
-                        -1
-#endif
-                );
-        if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
-                printk("migration_cost=");
-                for (distance = 0; distance <= max_distance; distance++) {
-                        if (distance)
-                                printk(",");
-                        printk("%ld", (long)migration_cost[distance] / 1000);
-                }
-                printk("\n");
-        }
-        j1 = jiffies;
-        if (migration_debug)
-                printk("migration: %ld seconds\n", (j1-j0) / HZ);
-        /*
-         * Move back to the original CPU. NUMA-Q gets confused
-         * if we migrate to another quad during bootup.
-         */
-        if (raw_smp_processor_id() != orig_cpu) {
-                cpumask_t mask = cpumask_of_cpu(orig_cpu),
-                        saved_mask = current->cpus_allowed;
-                set_cpus_allowed(current, mask);
-                set_cpus_allowed(current, saved_mask);
-        }
-}
 #ifdef CONFIG_NUMA
 /**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
                cpu_attach_domain(sd, i);
        }
-        /*
-         * Tune cache-hot values:
-         */
-        calibrate_migration_costs(cpu_map);
        return 0;
author	Ingo Molnar <mingo@elte.hu>	2007-07-09 12:51:57 -0400
committer	Ingo Molnar <mingo@elte.hu>	2007-07-09 12:51:57 -0400
commit	0437e109e1841607f2988891eaa36c531c6aa6ac (patch)
tree	e9d8f170786f7e33d4c5829cb008cf38d42a2014 /kernel
parent	0e6aca43e08a62a48d6770e9a159dbec167bf4c6 (diff)

diff --git a/kernel/sched.c b/kernel/sched.c index ac054d9a0719..46b23f0fee24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797		5797
5798	#define SD_NODES_PER_DOMAIN 16	5798	#define SD_NODES_PER_DOMAIN 16
5799		5799
5800	/*
5801	* Self-tuning task migration cost measurement between source and target CPUs.
5802	*
5803	* This is done by measuring the cost of manipulating buffers of varying
5804	* sizes. For a given buffer-size here are the steps that are taken:
5805	*
5806	* 1) the source CPU reads+dirties a shared buffer
5807	* 2) the target CPU reads+dirties the same shared buffer
5808	*
5809	* We measure how long they take, in the following 4 scenarios:
5810	*
5811	* - source: CPU1, target: CPU2 \| cost1
5812	* - source: CPU2, target: CPU1 \| cost2
5813	* - source: CPU1, target: CPU1 \| cost3
5814	* - source: CPU2, target: CPU2 \| cost4
5815	*
5816	* We then calculate the cost3+cost4-cost1-cost2 difference - this is
5817	* the cost of migration.
5818	*
5819	* We then start off from a small buffer-size and iterate up to larger
5820	* buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821	* doing a maximum search for the cost. (The maximum cost for a migration
5822	* normally occurs when the working set size is around the effective cache
5823	* size.)
5824	*/
5825	#define SEARCH_SCOPE 2
5826	#define MIN_CACHE_SIZE (64*1024U)
5827	#define DEFAULT_CACHE_SIZE (510241024U)
5828	#define ITERATIONS 1
5829	#define SIZE_THRESH 130
5830	#define COST_THRESH 130
5831
5832	/*
5833	* The migration cost is a function of 'domain distance'. Domain
5834	* distance is the number of steps a CPU has to iterate down its
5835	* domain tree to share a domain with the other CPU. The farther
5836	* two CPUs are from each other, the larger the distance gets.
5837	*
5838	* Note that we use the distance only to cache measurement results,
5839	* the distance value is not used numerically otherwise. When two
5840	* CPUs have the same distance it is assumed that the migration
5841	* cost is the same. (this is a simplification but quite practical)
5842	*/
5843	#define MAX_DOMAIN_DISTANCE 32
5844
5845	static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846	{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847	/*
5848	* Architectures may override the migration cost and thus avoid
5849	* boot-time calibration. Unit is nanoseconds. Mostly useful for
5850	* virtualized hardware:
5851	*/
5852	#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853	CONFIG_DEFAULT_MIGRATION_COST
5854	#else
5855	-1LL
5856	#endif
5857	};
5858
5859	/*
5860	* Allow override of migration cost - in units of microseconds.
5861	* E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862	* of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863	*/
5864	static int __init migration_cost_setup(char *str)
5865	{
5866	int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868	str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870	printk("#ints: %d\n", ints[0]);
5871	for (i = 1; i <= ints[0]; i++) {
5872	migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873	printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874	}
5875	return 1;
5876	}
5877
5878	__setup ("migration_cost=", migration_cost_setup);
5879
5880	/*
5881	* Global multiplier (divisor) for migration-cutoff values,
5882	* in percentiles. E.g. use a value of 150 to get 1.5 times
5883	* longer cache-hot cutoff times.
5884	*
5885	* (We scale it from 100 to 128 to long long handling easier.)
5886	*/
5887
5888	#define MIGRATION_FACTOR_SCALE 128
5889
5890	static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892	static int __init setup_migration_factor(char *str)
5893	{
5894	get_option(&str, &migration_factor);
5895	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896	return 1;
5897	}
5898
5899	__setup("migration_factor=", setup_migration_factor);
5900
5901	/*
5902	* Estimated distance of two CPUs, measured via the number of domains
5903	* we have to pass for the two CPUs to be in the same span:
5904	*/
5905	static unsigned long domain_distance(int cpu1, int cpu2)
5906	{
5907	unsigned long distance = 0;
5908	struct sched_domain *sd;
5909
5910	for_each_domain(cpu1, sd) {
5911	WARN_ON(!cpu_isset(cpu1, sd->span));
5912	if (cpu_isset(cpu2, sd->span))
5913	return distance;
5914	distance++;
5915	}
5916	if (distance >= MAX_DOMAIN_DISTANCE) {
5917	WARN_ON(1);
5918	distance = MAX_DOMAIN_DISTANCE-1;
5919	}
5920
5921	return distance;
5922	}
5923
5924	static unsigned int migration_debug;
5925
5926	static int __init setup_migration_debug(char *str)
5927	{
5928	get_option(&str, &migration_debug);
5929	return 1;
5930	}
5931
5932	__setup("migration_debug=", setup_migration_debug);
5933
5934	/*
5935	* Maximum cache-size that the scheduler should try to measure.
5936	* Architectures with larger caches should tune this up during
5937	* bootup. Gets used in the domain-setup code (i.e. during SMP
5938	* bootup).
5939	*/
5940	unsigned int max_cache_size;
5941
5942	static int __init setup_max_cache_size(char *str)
5943	{
5944	get_option(&str, &max_cache_size);
5945	return 1;
5946	}
5947
5948	__setup("max_cache_size=", setup_max_cache_size);
5949
5950	/*
5951	* Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952	* is the operation that is timed, so we try to generate unpredictable
5953	* cachemisses that still end up filling the L2 cache:
5954	*/
5955	static void touch_cache(void *__cache, unsigned long __size)
5956	{
5957	unsigned long size = __size / sizeof(long);
5958	unsigned long chunk1 = size / 3;
5959	unsigned long chunk2 = 2 * size / 3;
5960	unsigned long *cache = __cache;
5961	int i;
5962
5963	for (i = 0; i < size/6; i += 8) {
5964	switch (i % 6) {
5965	case 0: cache[i]++;
5966	case 1: cache[size-1-i]++;
5967	case 2: cache[chunk1-i]++;
5968	case 3: cache[chunk1+i]++;
5969	case 4: cache[chunk2-i]++;
5970	case 5: cache[chunk2+i]++;
5971	}
5972	}
5973	}
5974
5975	/*
5976	* Measure the cache-cost of one task migration. Returns in units of nsec.
5977	*/
5978	static unsigned long long
5979	measure_one(void *cache, unsigned long size, int source, int target)
5980	{
5981	cpumask_t mask, saved_mask;
5982	unsigned long long t0, t1, t2, t3, cost;
5983
5984	saved_mask = current->cpus_allowed;
5985
5986	/*
5987	* Flush source caches to RAM and invalidate them:
5988	*/
5989	sched_cacheflush();
5990
5991	/*
5992	* Migrate to the source CPU:
5993	*/
5994	mask = cpumask_of_cpu(source);
5995	set_cpus_allowed(current, mask);
5996	WARN_ON(smp_processor_id() != source);
5997
5998	/*
5999	* Dirty the working set:
6000	*/
6001	t0 = sched_clock();
6002	touch_cache(cache, size);
6003	t1 = sched_clock();
6004
6005	/*
6006	* Migrate to the target CPU, dirty the L2 cache and access
6007	* the shared buffer. (which represents the working set
6008	* of a migrated task.)
6009	*/
6010	mask = cpumask_of_cpu(target);
6011	set_cpus_allowed(current, mask);
6012	WARN_ON(smp_processor_id() != target);
6013
6014	t2 = sched_clock();
6015	touch_cache(cache, size);
6016	t3 = sched_clock();
6017
6018	cost = t1-t0 + t3-t2;
6019
6020	if (migration_debug >= 2)
6021	printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022	source, target, t1-t0, t1-t0, t3-t2, cost);
6023	/*
6024	* Flush target caches to RAM and invalidate them:
6025	*/
6026	sched_cacheflush();
6027
6028	set_cpus_allowed(current, saved_mask);
6029
6030	return cost;
6031	}
6032
6033	/*
6034	* Measure a series of task migrations and return the average
6035	* result. Since this code runs early during bootup the system
6036	* is 'undisturbed' and the average latency makes sense.
6037	*
6038	* The algorithm in essence auto-detects the relevant cache-size,
6039	* so it will properly detect different cachesizes for different
6040	* cache-hierarchies, depending on how the CPUs are connected.
6041	*
6042	* Architectures can prime the upper limit of the search range via
6043	* max_cache_size, otherwise the search range defaults to 20MB...64K.
6044	*/
6045	static unsigned long long
6046	measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047	{
6048	unsigned long long cost1, cost2;
6049	int i;
6050
6051	/*
6052	* Measure the migration cost of 'size' bytes, over an
6053	* average of 10 runs:
6054	*
6055	* (We perturb the cache size by a small (0..4k)
6056	* value to compensate size/alignment related artifacts.
6057	* We also subtract the cost of the operation done on
6058	* the same CPU.)
6059	*/
6060	cost1 = 0;
6061
6062	/*
6063	* dry run, to make sure we start off cache-cold on cpu1,
6064	* and to get any vmalloc pagefaults in advance:
6065	*/
6066	measure_one(cache, size, cpu1, cpu2);
6067	for (i = 0; i < ITERATIONS; i++)
6068	cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070	measure_one(cache, size, cpu2, cpu1);
6071	for (i = 0; i < ITERATIONS; i++)
6072	cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074	/*
6075	* (We measure the non-migrating [cached] cost on both
6076	* cpu1 and cpu2, to handle CPUs with different speeds)
6077	*/
6078	cost2 = 0;
6079
6080	measure_one(cache, size, cpu1, cpu1);
6081	for (i = 0; i < ITERATIONS; i++)
6082	cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084	measure_one(cache, size, cpu2, cpu2);
6085	for (i = 0; i < ITERATIONS; i++)
6086	cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088	/*
6089	* Get the per-iteration migration cost:
6090	*/
6091	do_div(cost1, 2 * ITERATIONS);
6092	do_div(cost2, 2 * ITERATIONS);
6093
6094	return cost1 - cost2;
6095	}
6096
6097	static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098	{
6099	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100	unsigned int max_size, size, size_found = 0;
6101	long long cost = 0, prev_cost;
6102	void *cache;
6103
6104	/*
6105	* Search from max_cache_size*5 down to 64K - the real relevant
6106	* cachesize has to lie somewhere inbetween.
6107	*/
6108	if (max_cache_size) {
6109	max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110	size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111	} else {
6112	/*
6113	* Since we have no estimation about the relevant
6114	* search range
6115	*/
6116	max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117	size = MIN_CACHE_SIZE;
6118	}
6119
6120	if (!cpu_online(cpu1) \|\| !cpu_online(cpu2)) {
6121	printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122	return 0;
6123	}
6124
6125	/*
6126	* Allocate the working set:
6127	*/
6128	cache = vmalloc(max_size);
6129	if (!cache) {
6130	printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131	return 1000000; /* return 1 msec on very small boxen */
6132	}
6133
6134	while (size <= max_size) {
6135	prev_cost = cost;
6136	cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138	/*
6139	* Update the max:
6140	*/
6141	if (cost > 0) {
6142	if (max_cost < cost) {
6143	max_cost = cost;
6144	size_found = size;
6145	}
6146	}
6147	/*
6148	* Calculate average fluctuation, we use this to prevent
6149	* noise from triggering an early break out of the loop:
6150	*/
6151	fluct = abs(cost - prev_cost);
6152	avg_fluct = (avg_fluct + fluct)/2;
6153
6154	if (migration_debug)
6155	printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156	"(%8Ld %8Ld)\n",
6157	cpu1, cpu2, size,
6158	(long)cost / 1000000,
6159	((long)cost / 100000) % 10,
6160	(long)max_cost / 1000000,
6161	((long)max_cost / 100000) % 10,
6162	domain_distance(cpu1, cpu2),
6163	cost, avg_fluct);
6164
6165	/*
6166	* If we iterated at least 20% past the previous maximum,
6167	* and the cost has dropped by more than 20% already,
6168	* (taking fluctuations into account) then we assume to
6169	* have found the maximum and break out of the loop early:
6170	*/
6171	if (size_found && (size100 > size_foundSIZE_THRESH))
6172	if (cost+avg_fluct <= 0 \|\|
6173	max_cost100 > (cost+avg_fluct)COST_THRESH) {
6174
6175	if (migration_debug)
6176	printk("-> found max.\n");
6177	break;
6178	}
6179	/*
6180	* Increase the cachesize in 10% steps:
6181	*/
6182	size = size * 10 / 9;
6183	}
6184
6185	if (migration_debug)
6186	printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187	cpu1, cpu2, size_found, max_cost);
6188
6189	vfree(cache);
6190
6191	/*
6192	* A task is considered 'cache cold' if at least 2 times
6193	* the worst-case cost of migration has passed.
6194	*
6195	* (this limit is only listened to if the load-balancing
6196	* situation is 'nice' - if there is a large imbalance we
6197	* ignore it for the sake of CPU utilization and
6198	* processing fairness.)
6199	*/
6200	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201	}
6202
6203	static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204	{
6205	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206	unsigned long j0, j1, distance, max_distance = 0;
6207	struct sched_domain *sd;
6208
6209	j0 = jiffies;
6210
6211	/*
6212	* First pass - calculate the cacheflush times:
6213	*/
6214	for_each_cpu_mask(cpu1, *cpu_map) {
6215	for_each_cpu_mask(cpu2, *cpu_map) {
6216	if (cpu1 == cpu2)
6217	continue;
6218	distance = domain_distance(cpu1, cpu2);
6219	max_distance = max(max_distance, distance);
6220	/*
6221	* No result cached yet?
6222	*/
6223	if (migration_cost[distance] == -1LL)
6224	migration_cost[distance] =
6225	measure_migration_cost(cpu1, cpu2);
6226	}
6227	}
6228	/*
6229	* Second pass - update the sched domain hierarchy with
6230	* the new cache-hot-time estimations:
6231	*/
6232	for_each_cpu_mask(cpu, *cpu_map) {
6233	distance = 0;
6234	for_each_domain(cpu, sd) {
6235	sd->cache_hot_time = migration_cost[distance];
6236	distance++;
6237	}
6238	}
6239	/*
6240	* Print the matrix:
6241	*/
6242	if (migration_debug)
6243	printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244	max_cache_size,
6245	#ifdef CONFIG_X86
6246	cpu_khz/1000
6247	#else
6248	-1
6249	#endif
6250	);
6251	if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252	printk("migration_cost=");
6253	for (distance = 0; distance <= max_distance; distance++) {
6254	if (distance)
6255	printk(",");
6256	printk("%ld", (long)migration_cost[distance] / 1000);
6257	}
6258	printk("\n");
6259	}
6260	j1 = jiffies;
6261	if (migration_debug)
6262	printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264	/*
6265	* Move back to the original CPU. NUMA-Q gets confused
6266	* if we migrate to another quad during bootup.
6267	*/
6268	if (raw_smp_processor_id() != orig_cpu) {
6269	cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270	saved_mask = current->cpus_allowed;
6271
6272	set_cpus_allowed(current, mask);
6273	set_cpus_allowed(current, saved_mask);
6274	}
6275	}
6276
6277	#ifdef CONFIG_NUMA	5800	#ifdef CONFIG_NUMA
6278		5801
6279	/**	5802	/**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803	#endif	6326	#endif
6804	cpu_attach_domain(sd, i);	6327	cpu_attach_domain(sd, i);
6805	}	6328	}
6806	/*
6807	* Tune cache-hot values:
6808	*/
6809	calibrate_migration_costs(cpu_map);
6810		6329
6811	return 0;	6330	return 0;
6812		6331