Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c | 481 ----
1 file changed, 0 insertions(+), 481 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 
 #define SD_NODES_PER_DOMAIN 16
 
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- *  1) the source CPU reads+dirties a shared buffer
- *  2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- *  - source: CPU1, target: CPU2 | cost1
- *  - source: CPU2, target: CPU1 | cost2
- *  - source: CPU1, target: CPU1 | cost3
- *  - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost1+cost2-cost3-cost4 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in roughly 10% steps - measuring each buffer-size
- * separately, and doing a maximum search for the cost. (The maximum
- * cost for a migration normally occurs when the working set size is
- * around the effective cache size.)
- */
-#define SEARCH_SCOPE		2
-#define MIN_CACHE_SIZE		(64*1024U)
-#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
-#define ITERATIONS		1
-#define SIZE_THRESH		130
-#define COST_THRESH		130
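
A worked example of the formula above, with purely illustrative numbers: suppose touching the buffer takes cost1 = 900 us and cost2 = 880 us in the two cross-CPU scenarios, but only cost3 = 400 us and cost4 = 390 us when source and target are the same CPU. The estimated migration cost for that buffer size is then (900 + 880) - (400 + 390) = 990 us: the extra time a freshly migrated task would spend re-warming the target CPU's cache.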
5831 | |||
5832 | /* | ||
5833 | * The migration cost is a function of 'domain distance'. Domain | ||
5834 | * distance is the number of steps a CPU has to iterate down its | ||
5835 | * domain tree to share a domain with the other CPU. The farther | ||
5836 | * two CPUs are from each other, the larger the distance gets. | ||
5837 | * | ||
5838 | * Note that we use the distance only to cache measurement results, | ||
5839 | * the distance value is not used numerically otherwise. When two | ||
5840 | * CPUs have the same distance it is assumed that the migration | ||
5841 | * cost is the same. (this is a simplification but quite practical) | ||
5842 | */ | ||
5843 | #define MAX_DOMAIN_DISTANCE 32 | ||
5844 | |||
5845 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5846 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = | ||
5847 | /* | ||
5848 | * Architectures may override the migration cost and thus avoid | ||
5849 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | ||
5850 | * virtualized hardware: | ||
5851 | */ | ||
5852 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | ||
5853 | CONFIG_DEFAULT_MIGRATION_COST | ||
5854 | #else | ||
5855 | -1LL | ||
5856 | #endif | ||
5857 | }; | ||
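
The [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = initializer above is a GCC range-designator extension that sets every element of the array to the same value. A minimal standalone illustration (builds with gcc; not part of the kernel patch):

#include <stdio.h>

/* Every element from index 0 through 3 is initialized to -1LL: */
static long long cost[4] = { [0 ... 3] = -1LL };

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("cost[%d] = %lld\n", i, cost[i]);
	return 0;
}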
5858 | |||
5859 | /* | ||
5860 | * Allow override of migration cost - in units of microseconds. | ||
5861 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5862 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
5863 | */ | ||
5864 | static int __init migration_cost_setup(char *str) | ||
5865 | { | ||
5866 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5867 | |||
5868 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5869 | |||
5870 | printk("#ints: %d\n", ints[0]); | ||
5871 | for (i = 1; i <= ints[0]; i++) { | ||
5872 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5873 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5874 | } | ||
5875 | return 1; | ||
5876 | } | ||
5877 | |||
5878 | __setup ("migration_cost=", migration_cost_setup); | ||
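
For example (hypothetical values): booting with migration_cost=4000,12000 makes get_options() store the count in ints[0] (here 2) and the values in ints[1..2], so the loop sets migration_cost[0] = 4,000,000 ns and migration_cost[1] = 12,000,000 ns - a 4 msec level-1 and a 12 msec level-2 cache-hot cutoff, and calibration is skipped for those levels.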
5879 | |||
5880 | /* | ||
5881 | * Global multiplier (divisor) for migration-cutoff values, | ||
5882 | * in percentiles. E.g. use a value of 150 to get 1.5 times | ||
5883 | * longer cache-hot cutoff times. | ||
5884 | * | ||
5885 | * (We scale it from 100 to 128 to long long handling easier.) | ||
5886 | */ | ||
5887 | |||
5888 | #define MIGRATION_FACTOR_SCALE 128 | ||
5889 | |||
5890 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5891 | |||
5892 | static int __init setup_migration_factor(char *str) | ||
5893 | { | ||
5894 | get_option(&str, &migration_factor); | ||
5895 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5896 | return 1; | ||
5897 | } | ||
5898 | |||
5899 | __setup("migration_factor=", setup_migration_factor); | ||
5900 | |||
5901 | /* | ||
5902 | * Estimated distance of two CPUs, measured via the number of domains | ||
5903 | * we have to pass for the two CPUs to be in the same span: | ||
5904 | */ | ||
5905 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5906 | { | ||
5907 | unsigned long distance = 0; | ||
5908 | struct sched_domain *sd; | ||
5909 | |||
5910 | for_each_domain(cpu1, sd) { | ||
5911 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5912 | if (cpu_isset(cpu2, sd->span)) | ||
5913 | return distance; | ||
5914 | distance++; | ||
5915 | } | ||
5916 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5917 | WARN_ON(1); | ||
5918 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5919 | } | ||
5920 | |||
5921 | return distance; | ||
5922 | } | ||
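
As an illustration, take a hypothetical two-socket machine with SMT, where each CPU has a per-core domain below a per-socket domain below a top-level domain: two hardware threads of the same core already share the lowest span, so their distance is 0; two cores in the same socket first meet in the socket-level span, distance 1; CPUs in different sockets only meet at the top level, distance 2. Each distinct distance gets one cached slot in migration_cost[].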
5923 | |||
5924 | static unsigned int migration_debug; | ||
5925 | |||
5926 | static int __init setup_migration_debug(char *str) | ||
5927 | { | ||
5928 | get_option(&str, &migration_debug); | ||
5929 | return 1; | ||
5930 | } | ||
5931 | |||
5932 | __setup("migration_debug=", setup_migration_debug); | ||
5933 | |||
5934 | /* | ||
5935 | * Maximum cache-size that the scheduler should try to measure. | ||
5936 | * Architectures with larger caches should tune this up during | ||
5937 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5938 | * bootup). | ||
5939 | */ | ||
5940 | unsigned int max_cache_size; | ||
5941 | |||
5942 | static int __init setup_max_cache_size(char *str) | ||
5943 | { | ||
5944 | get_option(&str, &max_cache_size); | ||
5945 | return 1; | ||
5946 | } | ||
5947 | |||
5948 | __setup("max_cache_size=", setup_max_cache_size); | ||
5949 | |||
5950 | /* | ||
5951 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5952 | * is the operation that is timed, so we try to generate unpredictable | ||
5953 | * cachemisses that still end up filling the L2 cache: | ||
5954 | */ | ||
5955 | static void touch_cache(void *__cache, unsigned long __size) | ||
5956 | { | ||
5957 | unsigned long size = __size / sizeof(long); | ||
5958 | unsigned long chunk1 = size / 3; | ||
5959 | unsigned long chunk2 = 2 * size / 3; | ||
5960 | unsigned long *cache = __cache; | ||
5961 | int i; | ||
5962 | |||
5963 | for (i = 0; i < size/6; i += 8) { | ||
5964 | switch (i % 6) { | ||
5965 | case 0: cache[i]++; | ||
5966 | case 1: cache[size-1-i]++; | ||
5967 | case 2: cache[chunk1-i]++; | ||
5968 | case 3: cache[chunk1+i]++; | ||
5969 | case 4: cache[chunk2-i]++; | ||
5970 | case 5: cache[chunk2+i]++; | ||
5971 | } | ||
5972 | } | ||
5973 | } | ||
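
A reading note on touch_cache(), since the control flow is easy to misparse: i advances in steps of 8, so i % 6 only ever evaluates to 0, 2 or 4, and because the switch contains no break statements, every case below the matching label runs as well. A hit on case 0 therefore increments all six locations, case 2 the last four, and case 4 the last two - a scattered forward/backward/middle access pattern that defeats simple hardware prefetching while still filling the cache.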
5974 | |||
5975 | /* | ||
5976 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5977 | */ | ||
5978 | static unsigned long long | ||
5979 | measure_one(void *cache, unsigned long size, int source, int target) | ||
5980 | { | ||
5981 | cpumask_t mask, saved_mask; | ||
5982 | unsigned long long t0, t1, t2, t3, cost; | ||
5983 | |||
5984 | saved_mask = current->cpus_allowed; | ||
5985 | |||
5986 | /* | ||
5987 | * Flush source caches to RAM and invalidate them: | ||
5988 | */ | ||
5989 | sched_cacheflush(); | ||
5990 | |||
5991 | /* | ||
5992 | * Migrate to the source CPU: | ||
5993 | */ | ||
5994 | mask = cpumask_of_cpu(source); | ||
5995 | set_cpus_allowed(current, mask); | ||
5996 | WARN_ON(smp_processor_id() != source); | ||
5997 | |||
5998 | /* | ||
5999 | * Dirty the working set: | ||
6000 | */ | ||
6001 | t0 = sched_clock(); | ||
6002 | touch_cache(cache, size); | ||
6003 | t1 = sched_clock(); | ||
6004 | |||
6005 | /* | ||
6006 | * Migrate to the target CPU, dirty the L2 cache and access | ||
6007 | * the shared buffer. (which represents the working set | ||
6008 | * of a migrated task.) | ||
6009 | */ | ||
6010 | mask = cpumask_of_cpu(target); | ||
6011 | set_cpus_allowed(current, mask); | ||
6012 | WARN_ON(smp_processor_id() != target); | ||
6013 | |||
6014 | t2 = sched_clock(); | ||
6015 | touch_cache(cache, size); | ||
6016 | t3 = sched_clock(); | ||
6017 | |||
6018 | cost = t1-t0 + t3-t2; | ||
6019 | |||
6020 | if (migration_debug >= 2) | ||
6021 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
6022 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
6023 | /* | ||
6024 | * Flush target caches to RAM and invalidate them: | ||
6025 | */ | ||
6026 | sched_cacheflush(); | ||
6027 | |||
6028 | set_cpus_allowed(current, saved_mask); | ||
6029 | |||
6030 | return cost; | ||
6031 | } | ||
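
For readers who want to reproduce the idea outside the kernel, the same measurement pattern can be sketched in userspace with sched_setaffinity() and clock_gettime(). This is a simplified analogue, not the patch's code: the CPU numbers, buffer size and linear access pattern are arbitrary choices, and error handling is omitted.

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pin the calling thread to one CPU - the userspace counterpart of
 * set_cpus_allowed() in measure_one() above. */
static void pin_to(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
}

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Dirty the buffer on 'source', migrate, dirty it again on 'target',
 * and return the summed touch times - the shape of measure_one(). */
static uint64_t measure_one_user(long *buf, size_t words, int source, int target)
{
	uint64_t t0, t1, t2, t3;
	size_t i;

	pin_to(source);
	t0 = now_ns();
	for (i = 0; i < words; i++)
		buf[i]++;
	t1 = now_ns();

	pin_to(target);
	t2 = now_ns();
	for (i = 0; i < words; i++)
		buf[i]++;
	t3 = now_ns();

	return (t1 - t0) + (t3 - t2);
}

int main(void)
{
	size_t words = (4 * 1024 * 1024) / sizeof(long);	/* 4 MB working set */
	long *buf = calloc(words, sizeof(long));

	if (!buf)
		return 1;
	printf("cross-CPU cost: %llu ns\n",
	       (unsigned long long)measure_one_user(buf, words, 0, 1));
	printf("same-CPU cost:  %llu ns\n",
	       (unsigned long long)measure_one_user(buf, words, 0, 0));
	free(buf);
	return 0;
}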
6032 | |||
6033 | /* | ||
6034 | * Measure a series of task migrations and return the average | ||
6035 | * result. Since this code runs early during bootup the system | ||
6036 | * is 'undisturbed' and the average latency makes sense. | ||
6037 | * | ||
6038 | * The algorithm in essence auto-detects the relevant cache-size, | ||
6039 | * so it will properly detect different cachesizes for different | ||
6040 | * cache-hierarchies, depending on how the CPUs are connected. | ||
6041 | * | ||
6042 | * Architectures can prime the upper limit of the search range via | ||
6043 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | ||
6044 | */ | ||
6045 | static unsigned long long | ||
6046 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
6047 | { | ||
6048 | unsigned long long cost1, cost2; | ||
6049 | int i; | ||
6050 | |||
6051 | /* | ||
6052 | * Measure the migration cost of 'size' bytes, over an | ||
6053 | * average of 10 runs: | ||
6054 | * | ||
6055 | * (We perturb the cache size by a small (0..4k) | ||
6056 | * value to compensate size/alignment related artifacts. | ||
6057 | * We also subtract the cost of the operation done on | ||
6058 | * the same CPU.) | ||
6059 | */ | ||
6060 | cost1 = 0; | ||
6061 | |||
6062 | /* | ||
6063 | * dry run, to make sure we start off cache-cold on cpu1, | ||
6064 | * and to get any vmalloc pagefaults in advance: | ||
6065 | */ | ||
6066 | measure_one(cache, size, cpu1, cpu2); | ||
6067 | for (i = 0; i < ITERATIONS; i++) | ||
6068 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); | ||
6069 | |||
6070 | measure_one(cache, size, cpu2, cpu1); | ||
6071 | for (i = 0; i < ITERATIONS; i++) | ||
6072 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); | ||
6073 | |||
6074 | /* | ||
6075 | * (We measure the non-migrating [cached] cost on both | ||
6076 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
6077 | */ | ||
6078 | cost2 = 0; | ||
6079 | |||
6080 | measure_one(cache, size, cpu1, cpu1); | ||
6081 | for (i = 0; i < ITERATIONS; i++) | ||
6082 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); | ||
6083 | |||
6084 | measure_one(cache, size, cpu2, cpu2); | ||
6085 | for (i = 0; i < ITERATIONS; i++) | ||
6086 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); | ||
6087 | |||
6088 | /* | ||
6089 | * Get the per-iteration migration cost: | ||
6090 | */ | ||
6091 | do_div(cost1, 2 * ITERATIONS); | ||
6092 | do_div(cost2, 2 * ITERATIONS); | ||
6093 | |||
6094 | return cost1 - cost2; | ||
6095 | } | ||
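
Numerically (illustrative figures): with ITERATIONS = 1, cost1 is the sum of one cpu1->cpu2 and one cpu2->cpu1 run, say 950,000 + 970,000 = 1,920,000 ns, and cost2 the sum of the two same-CPU runs, say 800,000 + 820,000 = 1,620,000 ns. After do_div() by 2*ITERATIONS both become per-run averages (960,000 and 810,000 ns) and the function returns the 150,000 ns difference - the cache-refill penalty attributable to migration at this buffer size.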
6096 | |||
6097 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
6098 | { | ||
6099 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
6100 | unsigned int max_size, size, size_found = 0; | ||
6101 | long long cost = 0, prev_cost; | ||
6102 | void *cache; | ||
6103 | |||
6104 | /* | ||
6105 | * Search from max_cache_size*5 down to 64K - the real relevant | ||
6106 | * cachesize has to lie somewhere inbetween. | ||
6107 | */ | ||
6108 | if (max_cache_size) { | ||
6109 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6110 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6111 | } else { | ||
6112 | /* | ||
6113 | * Since we have no estimation about the relevant | ||
6114 | * search range | ||
6115 | */ | ||
6116 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
6117 | size = MIN_CACHE_SIZE; | ||
6118 | } | ||
6119 | |||
6120 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
6121 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
6122 | return 0; | ||
6123 | } | ||
6124 | |||
6125 | /* | ||
6126 | * Allocate the working set: | ||
6127 | */ | ||
6128 | cache = vmalloc(max_size); | ||
6129 | if (!cache) { | ||
6130 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); | ||
6131 | return 1000000; /* return 1 msec on very small boxen */ | ||
6132 | } | ||
6133 | |||
6134 | while (size <= max_size) { | ||
6135 | prev_cost = cost; | ||
6136 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
6137 | |||
6138 | /* | ||
6139 | * Update the max: | ||
6140 | */ | ||
6141 | if (cost > 0) { | ||
6142 | if (max_cost < cost) { | ||
6143 | max_cost = cost; | ||
6144 | size_found = size; | ||
6145 | } | ||
6146 | } | ||
6147 | /* | ||
6148 | * Calculate average fluctuation, we use this to prevent | ||
6149 | * noise from triggering an early break out of the loop: | ||
6150 | */ | ||
6151 | fluct = abs(cost - prev_cost); | ||
6152 | avg_fluct = (avg_fluct + fluct)/2; | ||
6153 | |||
6154 | if (migration_debug) | ||
6155 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " | ||
6156 | "(%8Ld %8Ld)\n", | ||
6157 | cpu1, cpu2, size, | ||
6158 | (long)cost / 1000000, | ||
6159 | ((long)cost / 100000) % 10, | ||
6160 | (long)max_cost / 1000000, | ||
6161 | ((long)max_cost / 100000) % 10, | ||
6162 | domain_distance(cpu1, cpu2), | ||
6163 | cost, avg_fluct); | ||
6164 | |||
6165 | /* | ||
6166 | * If we iterated at least 20% past the previous maximum, | ||
6167 | * and the cost has dropped by more than 20% already, | ||
6168 | * (taking fluctuations into account) then we assume to | ||
6169 | * have found the maximum and break out of the loop early: | ||
6170 | */ | ||
6171 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
6172 | if (cost+avg_fluct <= 0 || | ||
6173 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
6174 | |||
6175 | if (migration_debug) | ||
6176 | printk("-> found max.\n"); | ||
6177 | break; | ||
6178 | } | ||
6179 | /* | ||
6180 | * Increase the cachesize in 10% steps: | ||
6181 | */ | ||
6182 | size = size * 10 / 9; | ||
6183 | } | ||
6184 | |||
6185 | if (migration_debug) | ||
6186 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
6187 | cpu1, cpu2, size_found, max_cost); | ||
6188 | |||
6189 | vfree(cache); | ||
6190 | |||
6191 | /* | ||
6192 | * A task is considered 'cache cold' if at least 2 times | ||
6193 | * the worst-case cost of migration has passed. | ||
6194 | * | ||
6195 | * (this limit is only listened to if the load-balancing | ||
6196 | * situation is 'nice' - if there is a large imbalance we | ||
6197 | * ignore it for the sake of CPU utilization and | ||
6198 | * processing fairness.) | ||
6199 | */ | ||
6200 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
6201 | } | ||
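
So if the maximum measured per-run penalty is, say, 3,000,000 ns, the returned cache-hot cutoff with the default migration_factor (128) is 2 * 3,000,000 * 128 / 128 = 6,000,000 ns = 6 msec; booting with migration_factor=150 (stored internally as 192) stretches that to 9 msec.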
6202 | |||
6203 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
6204 | { | ||
6205 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
6206 | unsigned long j0, j1, distance, max_distance = 0; | ||
6207 | struct sched_domain *sd; | ||
6208 | |||
6209 | j0 = jiffies; | ||
6210 | |||
6211 | /* | ||
6212 | * First pass - calculate the cacheflush times: | ||
6213 | */ | ||
6214 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
6215 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
6216 | if (cpu1 == cpu2) | ||
6217 | continue; | ||
6218 | distance = domain_distance(cpu1, cpu2); | ||
6219 | max_distance = max(max_distance, distance); | ||
6220 | /* | ||
6221 | * No result cached yet? | ||
6222 | */ | ||
6223 | if (migration_cost[distance] == -1LL) | ||
6224 | migration_cost[distance] = | ||
6225 | measure_migration_cost(cpu1, cpu2); | ||
6226 | } | ||
6227 | } | ||
6228 | /* | ||
6229 | * Second pass - update the sched domain hierarchy with | ||
6230 | * the new cache-hot-time estimations: | ||
6231 | */ | ||
6232 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6233 | distance = 0; | ||
6234 | for_each_domain(cpu, sd) { | ||
6235 | sd->cache_hot_time = migration_cost[distance]; | ||
6236 | distance++; | ||
6237 | } | ||
6238 | } | ||
6239 | /* | ||
6240 | * Print the matrix: | ||
6241 | */ | ||
6242 | if (migration_debug) | ||
6243 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
6244 | max_cache_size, | ||
6245 | #ifdef CONFIG_X86 | ||
6246 | cpu_khz/1000 | ||
6247 | #else | ||
6248 | -1 | ||
6249 | #endif | ||
6250 | ); | ||
6251 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { | ||
6252 | printk("migration_cost="); | ||
6253 | for (distance = 0; distance <= max_distance; distance++) { | ||
6254 | if (distance) | ||
6255 | printk(","); | ||
6256 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
6257 | } | ||
6258 | printk("\n"); | ||
6259 | } | ||
6260 | j1 = jiffies; | ||
6261 | if (migration_debug) | ||
6262 | printk("migration: %ld seconds\n", (j1-j0) / HZ); | ||
6263 | |||
6264 | /* | ||
6265 | * Move back to the original CPU. NUMA-Q gets confused | ||
6266 | * if we migrate to another quad during bootup. | ||
6267 | */ | ||
6268 | if (raw_smp_processor_id() != orig_cpu) { | ||
6269 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
6270 | saved_mask = current->cpus_allowed; | ||
6271 | |||
6272 | set_cpus_allowed(current, mask); | ||
6273 | set_cpus_allowed(current, saved_mask); | ||
6274 | } | ||
6275 | } | ||
6276 | |||
 #ifdef CONFIG_NUMA
 
 /**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 		cpu_attach_domain(sd, i);
 	}
-	/*
-	 * Tune cache-hot values:
-	 */
-	calibrate_migration_costs(cpu_map);
 
 	return 0;
 