Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	468
1 file changed, 468 insertions(+), 0 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d5e..98461de1ab65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -5082,7 +5083,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size these are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost1+cost2-cost3-cost4 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in ~5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE		2
+#define MIN_CACHE_SIZE		(64*1024U)
+#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
+#define ITERATIONS		2
+#define SIZE_THRESH		130
+#define COST_THRESH		130
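+/*
+ * (SIZE_THRESH and COST_THRESH are percentages: 130 means a factor
+ * of 1.3, used by the early break-out heuristic further below.)
+ */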
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
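+/*
+ * (-1 denotes 'not measured yet': entries are filled in on demand by
+ * calibrate_migration_costs() during domain setup.)
+ */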
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+	int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+
+	printk("#ints: %d\n", ints[0]);
+	for (i = 1; i <= ints[0]; i++) {
+		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+	}
+	return 1;
+}
+
+__setup("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percent. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+	get_option(&str, &migration_factor);
+	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+	return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+	unsigned long distance = 0;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu1, sd) {
+		WARN_ON(!cpu_isset(cpu1, sd->span));
+		if (cpu_isset(cpu2, sd->span))
+			return distance;
+		distance++;
+	}
+	if (distance >= MAX_DOMAIN_DISTANCE) {
+		WARN_ON(1);
+		distance = MAX_DOMAIN_DISTANCE-1;
+	}
+
+	return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+	get_option(&str, &migration_debug);
+	return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+	get_option(&str, &max_cache_size);
+	return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+	unsigned long size = __size/sizeof(long), chunk1 = size/3,
+			chunk2 = 2*size/3;
+	unsigned long *cache = __cache;
+	int i;
+
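+	/*
+	 * (Note: the case labels below fall through - there are no
+	 * breaks - so each iteration increments several of the six
+	 * buffer regions, which adds to the unpredictability of the
+	 * access pattern.)
+	 */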
+	for (i = 0; i < size/6; i += 8) {
+		switch (i % 6) {
+			case 0: cache[i]++;
+			case 1: cache[size-1-i]++;
+			case 2: cache[chunk1-i]++;
+			case 3: cache[chunk1+i]++;
+			case 4: cache[chunk2-i]++;
+			case 5: cache[chunk2+i]++;
+		}
+	}
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+				      int source, int target)
+{
+	cpumask_t mask, saved_mask;
+	unsigned long long t0, t1, t2, t3, cost;
+
+	saved_mask = current->cpus_allowed;
+
+	/*
+	 * Flush source caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	/*
+	 * Migrate to the source CPU:
+	 */
+	mask = cpumask_of_cpu(source);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != source);
+
+	/*
+	 * Dirty the working set:
+	 */
+	t0 = sched_clock();
+	touch_cache(cache, size);
+	t1 = sched_clock();
+
+	/*
+	 * Migrate to the target CPU, dirty the L2 cache and access
+	 * the shared buffer. (This represents the working set of
+	 * a migrated task.)
+	 */
+	mask = cpumask_of_cpu(target);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != target);
+
+	t2 = sched_clock();
+	touch_cache(cache, size);
+	t3 = sched_clock();
+
+	cost = t1-t0 + t3-t2;
+
+	if (migration_debug >= 2)
+		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+			source, target, t1-t0, t2-t1, t3-t2, cost);
+	/*
+	 * Flush target caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	set_cpus_allowed(current, saved_mask);
+
+	return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 10MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+	unsigned long long cost1, cost2;
+	int i;
+
+	/*
+	 * Measure the migration cost of 'size' bytes, averaged over
+	 * 2*ITERATIONS runs per direction:
+	 *
+	 * (We perturb the buffer size by a small (1k per iteration)
+	 *  value to compensate for size/alignment related artifacts.
+	 *  We also subtract the cost of the operation done on
+	 *  the same CPU.)
+	 */
+	cost1 = 0;
+
+	/*
+	 * dry run, to make sure we start off cache-cold on cpu1,
+	 * and to get any vmalloc pagefaults in advance:
+	 */
+	measure_one(cache, size, cpu1, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+	measure_one(cache, size, cpu2, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+	/*
+	 * (We measure the non-migrating [cached] cost on both
+	 *  cpu1 and cpu2, to handle CPUs with different speeds)
+	 */
+	cost2 = 0;
+
+	measure_one(cache, size, cpu1, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+	measure_one(cache, size, cpu2, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+	/*
+	 * Get the per-iteration migration cost:
+	 */
+	do_div(cost1, 2*ITERATIONS);
+	do_div(cost2, 2*ITERATIONS);
+
+	return cost1 - cost2;
+}
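+/*
+ * Find the maximum migration cost over the buffer-size search range
+ * and derive the cache-hot cutoff from it. Returns nanoseconds:
+ */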
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+	unsigned int max_size, size, size_found = 0;
+	long long cost = 0, prev_cost;
+	void *cache;
+
+	/*
+	 * Search from max_cache_size/SEARCH_SCOPE up to
+	 * max_cache_size*SEARCH_SCOPE (bounded below by 64K) - the
+	 * real relevant cachesize has to lie somewhere in between.
+	 */
+	if (max_cache_size) {
+		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+	} else {
+		/*
+		 * Since we have no estimate of the relevant
+		 * search range, use the defaults:
+		 */
+		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+		size = MIN_CACHE_SIZE;
+	}
+
+	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+		return 0;
+	}
+
+	/*
+	 * Allocate the working set:
+	 */
+	cache = vmalloc(max_size);
+	if (!cache) {
+		printk("could not vmalloc %d bytes for cache!\n", max_size);
+		return 1000000; /* return 1 msec on very small boxen */
+	}
+	while (size <= max_size) {
+		prev_cost = cost;
+		cost = measure_cost(cpu1, cpu2, cache, size);
+
+		/*
+		 * Update the max:
+		 */
+		if (cost > 0) {
+			if (max_cost < cost) {
+				max_cost = cost;
+				size_found = size;
+			}
+		}
+		/*
+		 * Calculate the average fluctuation; we use this to
+		 * prevent noise from triggering an early break out
+		 * of the loop:
+		 */
+		fluct = abs(cost - prev_cost);
+		avg_fluct = (avg_fluct + fluct)/2;
+
+		if (migration_debug)
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+				cpu1, cpu2, size,
+				(long)cost / 1000000,
+				((long)cost / 100000) % 10,
+				(long)max_cost / 1000000,
+				((long)max_cost / 100000) % 10,
+				domain_distance(cpu1, cpu2),
+				cost, avg_fluct);
+
+		/*
+		 * If we iterated at least 30% (SIZE_THRESH) past the
+		 * previous maximum, and the cost (plus its fluctuation)
+		 * has dropped well below the maximum (COST_THRESH),
+		 * then we assume we have found the maximum and break
+		 * out of the loop early:
+		 */
+		if (size_found && (size*100 > size_found*SIZE_THRESH))
+			if (cost+avg_fluct <= 0 ||
+				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+				if (migration_debug)
+					printk("-> found max.\n");
+				break;
+			}
+		/*
+		 * Increase the cachesize in ~5% steps (20/19 ~= 1.053):
+		 */
+		size = size * 20 / 19;
+	}
+
+	if (migration_debug)
+		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+			cpu1, cpu2, size_found, max_cost);
+
+	vfree(cache);
+
+	/*
+	 * A task is considered 'cache cold' once at least twice the
+	 * worst-case migration cost has passed since it last ran.
+	 *
+	 * (This limit is only honored if the load-balancing situation
+	 * is 'nice' - if there is a large imbalance we ignore it for
+	 * the sake of CPU utilization and processing fairness.)
+	 */
+	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
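+/*
+ * Fill in the migration_cost[] matrix for all CPU pairs in cpu_map
+ * and propagate the measurements into every sched domain's
+ * ->cache_hot_time:
+ */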
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+	unsigned long j0, j1, distance, max_distance = 0;
+	struct sched_domain *sd;
+
+	j0 = jiffies;
+
+	/*
+	 * First pass - measure the migration costs:
+	 */
+	for_each_cpu_mask(cpu1, *cpu_map) {
+		for_each_cpu_mask(cpu2, *cpu_map) {
+			if (cpu1 == cpu2)
+				continue;
+			distance = domain_distance(cpu1, cpu2);
+			max_distance = max(max_distance, distance);
+			/*
+			 * No result cached yet?
+			 */
+			if (migration_cost[distance] == -1LL)
+				migration_cost[distance] =
+					measure_migration_cost(cpu1, cpu2);
+		}
+	}
+	/*
+	 * Second pass - update the sched domain hierarchy with
+	 * the new cache-hot-time estimations:
+	 */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		distance = 0;
+		for_each_domain(cpu, sd) {
+			sd->cache_hot_time = migration_cost[distance];
+			distance++;
+		}
+	}
+	/*
+	 * Print the matrix:
+	 */
+	if (migration_debug)
+		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+			max_cache_size,
+#ifdef CONFIG_X86
+			cpu_khz/1000
+#else
+			-1
+#endif
+		);
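+	/*
+	 * (The migration_cost= line below is printed unconditionally,
+	 * so the measured values can be fed back on later boots via
+	 * the migration_cost= boot option.)
+	 */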
+	printk("migration_cost=");
+	for (distance = 0; distance <= max_distance; distance++) {
+		if (distance)
+			printk(",");
+		printk("%ld", (long)migration_cost[distance] / 1000);
+	}
+	printk("\n");
+	j1 = jiffies;
+	if (migration_debug)
+		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+	/*
+	 * Move back to the original CPU. NUMA-Q gets confused
+	 * if we migrate to another quad during bootup.
+	 */
+	if (raw_smp_processor_id() != orig_cpu) {
+		cpumask_t mask = cpumask_of_cpu(orig_cpu),
+			saved_mask = current->cpus_allowed;
+
+		set_cpus_allowed(current, mask);
+		set_cpus_allowed(current, saved_mask);
+	}
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5448,6 +5912,10 @@ next_sg:
 #endif
 		cpu_attach_domain(sd, i);
 	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.