author     Ingo Molnar <mingo@elte.hu>    2007-07-09 12:51:57 -0400
committer  Ingo Molnar <mingo@elte.hu>    2007-07-09 12:51:57 -0400
commit     0437e109e1841607f2988891eaa36c531c6aa6ac (patch)
tree       e9d8f170786f7e33d4c5829cb008cf38d42a2014
parent     0e6aca43e08a62a48d6770e9a159dbec167bf4c6 (diff)
sched: zap the migration init / cache-hot balancing code
The SMP load-balancer uses the boot-time migration-cost estimation code to try to improve the quality of balancing. The reason for this code is that the discrete priority queues do not preserve the order of scheduling accurately, so the load-balancer skips tasks that were running on a CPU 'recently'.

This code is fundamentally fragile: the boot-time migration-cost detector does not really work on systems with large L3 caches, it caused boot delays on large systems, and the whole cache-hot concept made the balancing code fairly nondeterministic as well. (And hey, I wrote most of it, so I can say out loud that it sucks ;-)

Under CFS the same cache-affinity goal can be achieved without any cache-hot special case: tasks are sorted in the 'timeline' tree and the SMP balancer picks tasks from the left side of the tree, so the most cache-cold task is balanced automatically.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
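As a rough sketch of the balancing idea described above (plain user-space C, not the actual CFS code; the task structure, timestamps and helper name are invented for this example), picking the runnable task that has been off a CPU the longest yields the most cache-cold migration candidate with no cache-hot cutoff at all; a simple linear scan stands in here for the leftmost lookup in the time-ordered tree:

#include <stdio.h>

struct task {
	const char *name;
	unsigned long long last_ran_ns;	/* when the task last left a CPU */
};

/*
 * Pick the migration candidate the way the commit message describes:
 * the task that has waited longest (smallest last_ran_ns) is the most
 * cache-cold one, so no cache-hot cutoff is needed.
 */
static struct task *pick_most_cache_cold(struct task *tasks, int nr)
{
	struct task *coldest = &tasks[0];
	int i;

	for (i = 1; i < nr; i++)
		if (tasks[i].last_ran_ns < coldest->last_ran_ns)
			coldest = &tasks[i];
	return coldest;
}

int main(void)
{
	struct task rq[] = {
		{ "worker-a", 5000000ULL },
		{ "worker-b", 1000000ULL },	/* off-CPU longest -> coldest */
		{ "worker-c", 9000000ULL },
	};

	printf("migrate: %s\n", pick_most_cache_cold(rq, 3)->name);
	return 0;
}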
-rw-r--r--   Documentation/kernel-parameters.txt |  43
-rw-r--r--   arch/i386/kernel/smpboot.c          |  12
-rw-r--r--   arch/ia64/kernel/setup.c            |   6
-rw-r--r--   arch/mips/kernel/smp.c              |  11
-rw-r--r--   arch/sparc/kernel/smp.c             |  10
-rw-r--r--   arch/sparc64/kernel/smp.c           |  27
-rw-r--r--   include/linux/sched.h               |   6
-rw-r--r--   kernel/sched.c                      | 481
8 files changed, 0 insertions, 596 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af50f9bbe68e..4d880b3d1f35 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
1014 1014
1015 mga= [HW,DRM] 1015 mga= [HW,DRM]
1016 1016
1017 migration_cost=
1018 [KNL,SMP] debug: override scheduler migration costs
1019 Format: <level-1-usecs>,<level-2-usecs>,...
1020 This debugging option can be used to override the
1021 default scheduler migration cost matrix. The numbers
1022 are indexed by 'CPU domain distance'.
1023 E.g. migration_cost=1000,2000,3000 on an SMT NUMA
1024 box will set up an intra-core migration cost of
1025 1 msec, an inter-core migration cost of 2 msecs,
1026 and an inter-node migration cost of 3 msecs.
1027
1028 WARNING: using the wrong values here can break
1029 scheduler performance, so it's only for scheduler
1030 development purposes, not production environments.
1031
1032 migration_debug=
1033 [KNL,SMP] migration cost auto-detect verbosity
1034 Format=<0|1|2>
1035 If a system's migration matrix reported at bootup
1036 seems erroneous then this option can be used to
1037 increase verbosity of the detection process.
1038 We default to 0 (no extra messages), 1 will print
1039 some more information, and 2 will be really
1040 verbose (probably only useful if you also have a
1041 serial console attached to the system).
1042
1043 migration_factor=
1044 [KNL,SMP] multiply/divide migration costs by a factor
1045 Format=<percent>
1046 This debug option can be used to proportionally
1047 increase or decrease the auto-detected migration
1048 costs for all entries of the migration matrix.
1049 E.g. migration_factor=150 will increase migration
1050 costs by 50%. (and thus the scheduler will be less
1051 eager migrating cache-hot tasks)
1052 migration_factor=80 will decrease migration costs
1053 by 20%. (thus the scheduler will be more eager to
1054 migrate tasks)
1055
1056 WARNING: using the wrong values here can break
1057 scheduler performance, so it's only for scheduler
1058 development purposes, not production environments.
1059
1060 mousedev.tap_time= 1017 mousedev.tap_time=
1061 [MOUSE] Maximum time between finger touching and 1018 [MOUSE] Maximum time between finger touching and
1062 leaving touchpad surface for touch to be considered 1019 leaving touchpad surface for touch to be considered
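For reference, a minimal user-space sketch of how overrides in the removed migration_cost=/migration_factor= format were interpreted: comma-separated per-distance values in microseconds, converted to nanoseconds and optionally scaled by a percentage. The parsing below is illustrative only and is not the kernel's get_options() based setup code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_DISTANCE 32

int main(void)
{
	char arg[] = "1000,2000,3000";		/* as in migration_cost=1000,2000,3000 */
	unsigned long long cost_ns[MAX_DISTANCE] = { 0 };
	unsigned int factor_percent = 150;	/* as in migration_factor=150 */
	int n = 0;

	/* split the comma-separated list; values are microseconds */
	for (char *tok = strtok(arg, ","); tok && n < MAX_DISTANCE;
	     tok = strtok(NULL, ","))
		cost_ns[n++] = strtoull(tok, NULL, 10) * 1000;	/* usecs -> nsecs */

	/* apply the global percentage factor to every distance level */
	for (int i = 0; i < n; i++) {
		cost_ns[i] = cost_ns[i] * factor_percent / 100;
		printf("distance %d: %llu ns\n", i, cost_ns[i]);
	}
	return 0;
}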
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 88baed1e7e83..0b2954534b8e 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ exit:
941} 941}
942#endif 942#endif
943 943
944static void smp_tune_scheduling(void)
945{
946 if (cpu_khz) {
947 /* cache size in kB */
948 long cachesize = boot_cpu_data.x86_cache_size;
949
950 if (cachesize > 0)
951 max_cache_size = cachesize * 1024;
952 }
953}
954
955/* 944/*
956 * Cycle through the processors sending APIC IPIs to boot each. 945 * Cycle through the processors sending APIC IPIs to boot each.
957 */ 946 */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
980 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; 969 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
981 970
982 current_thread_info()->cpu = 0; 971 current_thread_info()->cpu = 0;
983 smp_tune_scheduling();
984 972
985 set_cpu_sibling_map(0); 973 set_cpu_sibling_map(0);
986 974
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index eaa6a24bc0b6..188fb73c6845 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
805get_max_cacheline_size (void) 805get_max_cacheline_size (void)
806{ 806{
807 unsigned long line_size, max = 1; 807 unsigned long line_size, max = 1;
808 unsigned int cache_size = 0;
809 u64 l, levels, unique_caches; 808 u64 l, levels, unique_caches;
810 pal_cache_config_info_t cci; 809 pal_cache_config_info_t cci;
811 s64 status; 810 s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
835 line_size = 1 << cci.pcci_line_size; 834 line_size = 1 << cci.pcci_line_size;
836 if (line_size > max) 835 if (line_size > max)
837 max = line_size; 836 max = line_size;
838 if (cache_size < cci.pcci_cache_size)
839 cache_size = cci.pcci_cache_size;
840 if (!cci.pcci_unified) { 837 if (!cci.pcci_unified) {
841 status = ia64_pal_cache_config_info(l, 838 status = ia64_pal_cache_config_info(l,
842 /* cache_type (instruction)= */ 1, 839 /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
853 ia64_i_cache_stride_shift = cci.pcci_stride; 850 ia64_i_cache_stride_shift = cci.pcci_stride;
854 } 851 }
855 out: 852 out:
856#ifdef CONFIG_SMP
857 max_cache_size = max(max_cache_size, cache_size);
858#endif
859 if (max > ia64_max_cacheline_size) 853 if (max > ia64_max_cacheline_size)
860 ia64_max_cacheline_size = max; 854 ia64_max_cacheline_size = max;
861} 855}
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 67edfa7ed93a..a1b017f2dbb3 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */
51EXPORT_SYMBOL(phys_cpu_present_map); 51EXPORT_SYMBOL(phys_cpu_present_map);
52EXPORT_SYMBOL(cpu_online_map); 52EXPORT_SYMBOL(cpu_online_map);
53 53
54/* This happens early in bootup, can't really do it better */
55static void smp_tune_scheduling (void)
56{
57 struct cache_desc *cd = &current_cpu_data.scache;
58 unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
59
60 if (cachesize > max_cache_size)
61 max_cache_size = cachesize;
62}
63
64extern void __init calibrate_delay(void); 54extern void __init calibrate_delay(void);
65extern ATTRIB_NORET void cpu_idle(void); 55extern ATTRIB_NORET void cpu_idle(void);
66 56
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
228{ 218{
229 init_new_context(current, &init_mm); 219 init_new_context(current, &init_mm);
230 current_thread_info()->cpu = 0; 220 current_thread_info()->cpu = 0;
231 smp_tune_scheduling();
232 plat_prepare_cpus(max_cpus); 221 plat_prepare_cpus(max_cpus);
233#ifndef CONFIG_HOTPLUG_CPU 222#ifndef CONFIG_HOTPLUG_CPU
234 cpu_present_map = cpu_possible_map; 223 cpu_present_map = cpu_possible_map;
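The removed MIPS hook above derived the secondary-cache size from its geometry (line size times number of sets times associativity). A standalone sketch of that arithmetic, with made-up example numbers:

#include <stdio.h>

int main(void)
{
	/* example geometry: 32-byte lines, 512 sets, 4-way associative */
	unsigned long linesz = 32, sets = 512, ways = 4;
	unsigned long cachesize = linesz * sets * ways;

	printf("scache: %lu bytes (%lu KB)\n", cachesize, cachesize / 1024);
	return 0;
}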
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index 4d9ad59031bb..4fea3ac7bff0 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
68 cpu_data(id).prom_node = cpu_node; 68 cpu_data(id).prom_node = cpu_node;
69 cpu_data(id).mid = cpu_get_hwmid(cpu_node); 69 cpu_data(id).mid = cpu_get_hwmid(cpu_node);
70 70
71 /* this is required to tune the scheduler correctly */
72 /* is it possible to have CPUs with different cache sizes? */
73 if (id == boot_cpu_id) {
74 int cache_line,cache_nlines;
75 cache_line = 0x20;
76 cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
77 cache_nlines = 0x8000;
78 cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
79 max_cache_size = cache_line * cache_nlines;
80 }
81 if (cpu_data(id).mid < 0) 71 if (cpu_data(id).mid < 0)
82 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); 72 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
83} 73}
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4dcd7d0b60f2..40e40f968d61 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
1163 return -EINVAL; 1163 return -EINVAL;
1164} 1164}
1165 1165
1166static void __init smp_tune_scheduling(void)
1167{
1168 unsigned int smallest = ~0U;
1169 int i;
1170
1171 for (i = 0; i < NR_CPUS; i++) {
1172 unsigned int val = cpu_data(i).ecache_size;
1173
1174 if (val && val < smallest)
1175 smallest = val;
1176 }
1177
1178 /* Any value less than 256K is nonsense. */
1179 if (smallest < (256U * 1024U))
1180 smallest = 256 * 1024;
1181
1182 max_cache_size = smallest;
1183
1184 if (smallest < 1U * 1024U * 1024U)
1185 printk(KERN_INFO "Using max_cache_size of %uKB\n",
1186 smallest / 1024U);
1187 else
1188 printk(KERN_INFO "Using max_cache_size of %uMB\n",
1189 smallest / 1024U / 1024U);
1190}
1191
1192/* Constrain the number of cpus to max_cpus. */ 1166/* Constrain the number of cpus to max_cpus. */
1193void __init smp_prepare_cpus(unsigned int max_cpus) 1167void __init smp_prepare_cpus(unsigned int max_cpus)
1194{ 1168{
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1206 } 1180 }
1207 1181
1208 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; 1182 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
1209 smp_tune_scheduling();
1210} 1183}
1211 1184
1212void __devinit smp_prepare_boot_cpu(void) 1185void __devinit smp_prepare_boot_cpu(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e74262f98e1..8764cda0feca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -754,12 +754,6 @@ struct sched_domain {
754extern int partition_sched_domains(cpumask_t *partition1, 754extern int partition_sched_domains(cpumask_t *partition1,
755 cpumask_t *partition2); 755 cpumask_t *partition2);
756 756
757/*
758 * Maximum cache size the migration-costs auto-tuning code will
759 * search from:
760 */
761extern unsigned int max_cache_size;
762
763#endif /* CONFIG_SMP */ 757#endif /* CONFIG_SMP */
764 758
765 759
diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5797
5798#define SD_NODES_PER_DOMAIN 16 5798#define SD_NODES_PER_DOMAIN 16
5799 5799
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the cost3+cost4-cost1-cost2 difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
5831
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to long long handling easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate size/alignment related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*5 down to 64K - the real relevant
6106 * cachesize has to lie somewhere inbetween.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimation about the relevant
6114 * search range
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5800#ifdef CONFIG_NUMA
6278 5801
6279/** 5802/**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6326#endif
6804 cpu_attach_domain(sd, i); 6327 cpu_attach_domain(sd, i);
6805 } 6328 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6329
6811 return 0; 6330 return 0;
6812 6331
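The bulk of the removed kernel/sched.c code implements the measurement loop its comments describe: take the cross-CPU touch cost minus the same-CPU touch cost as the migration cost, sweep the working-set size upward in roughly 10% steps, keep the maximum, and break out early once the search is clearly past the peak. A self-contained user-space sketch of that search loop, with a synthetic cost function standing in for the real cross-CPU measurements (all numbers below are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

#define SIZE_THRESH 130		/* as in the removed code: 30% past the peak size */
#define COST_THRESH 130		/* cost has dropped well below the maximum */

/*
 * Synthetic stand-in for measure_cost(): cost peaks near a 2 MB working
 * set, mimicking "cost is largest around the effective cache size".
 */
static long long fake_cost(unsigned int size)
{
	long long peak = 2 * 1024 * 1024;
	long long d = (long long)size - peak;

	if (d < 0)
		d = -d;
	return 1000000 - d / 4 + rand() % 20000;	/* plus a little noise */
}

int main(void)
{
	unsigned int size = 64 * 1024, max_size = 20 * 1024 * 1024;
	unsigned int size_found = 0;
	long long cost = 0, prev_cost, max_cost = 0, fluct, avg_fluct = 0;

	while (size <= max_size) {
		prev_cost = cost;
		cost = fake_cost(size);

		/* track the maximum cost and the size at which it occurred */
		if (cost > 0 && cost > max_cost) {
			max_cost = cost;
			size_found = size;
		}

		/* average fluctuation guards against noise-triggered breaks */
		fluct = llabs(cost - prev_cost);
		avg_fluct = (avg_fluct + fluct) / 2;

		/* break once well past the maximum and clearly below it */
		if (size_found &&
		    size * 100ULL > (unsigned long long)size_found * SIZE_THRESH &&
		    (cost + avg_fluct <= 0 ||
		     max_cost * 100 > (cost + avg_fluct) * COST_THRESH))
			break;

		size = size * 10 / 9;	/* grow the working set in ~10% steps */
	}

	printf("working set with max cost: %u bytes, cost %lld\n",
	       size_found, max_cost);
	return 0;
}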