 Documentation/kernel-parameters.txt   |  43 +++
 arch/i386/kernel/smpboot.c            |   1 +
 arch/ia64/kernel/setup.c              |   6 ++
 arch/ppc/xmon/xmon.c                  |   2 +-
 include/asm-i386/topology.h           |   1 -
 include/asm-ia64/topology.h           |   2 --
 include/asm-mips/mach-ip27/topology.h |   1 -
 include/asm-powerpc/topology.h        |   1 -
 include/asm-x86_64/topology.h         |   1 -
 include/linux/sched.h                 |   9 +-
 include/linux/topology.h              |   2 --
 kernel/sched.c                        | 468 ++++++++++++++++++++++++++++++++-
 12 files changed, 527 insertions(+), 10 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dd0bfc291a68..fe11fccf7e41 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -856,6 +856,49 @@ running once the system is up.
 
        mga=            [HW,DRM]
 
+       migration_cost=
+                       [KNL,SMP] debug: override scheduler migration costs
+                       Format: <level-1-usecs>,<level-2-usecs>,...
+                       This debugging option can be used to override the
+                       default scheduler migration cost matrix. The numbers
+                       are indexed by 'CPU domain distance'.
+                       E.g. migration_cost=1000,2000,3000 on an SMT NUMA
+                       box will set up an intra-core migration cost of
+                       1 msec, an inter-core migration cost of 2 msecs,
+                       and an inter-node migration cost of 3 msecs.
+
+                       WARNING: using the wrong values here can break
+                       scheduler performance, so it's only for scheduler
+                       development purposes, not production environments.
+
+       migration_debug=
+                       [KNL,SMP] migration cost auto-detect verbosity
+                       Format: <0|1|2>
+                       If a system's migration matrix reported at bootup
+                       seems erroneous then this option can be used to
+                       increase verbosity of the detection process.
+                       We default to 0 (no extra messages), 1 will print
+                       some more information, and 2 will be really
+                       verbose (probably only useful if you also have a
+                       serial console attached to the system).
+
+       migration_factor=
+                       [KNL,SMP] multiply/divide migration costs by a factor
+                       Format: <percent>
+                       This debug option can be used to proportionally
+                       increase or decrease the auto-detected migration
+                       costs for all entries of the migration matrix.
+                       E.g. migration_factor=150 will increase migration
+                       costs by 50% (and thus make the scheduler less
+                       eager to migrate cache-hot tasks);
+                       migration_factor=80 will decrease migration costs
+                       by 20% (and thus make the scheduler more eager
+                       to migrate tasks).
+
+                       WARNING: using the wrong values here can break
+                       scheduler performance, so it's only for scheduler
+                       development purposes, not production environments.
+
        mousedev.tap_time=
                        [MOUSE] Maximum time between finger touching and
                        leaving touchpad surface for touch to be considered
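A worked example of how the three parameters interact (values are illustrative; the behaviour follows from the kernel/sched.c changes below). Booting with:

        migration_factor=120 migration_debug=1

scales every auto-detected matrix entry to roughly 1.2x its measured value (the percentage is first converted to 128-based fixed point, 120*128/100 = 153, so the effective factor is 153/128, about 1.195x) and prints one line per measured buffer size. Booting instead with:

        migration_cost=1000,2000

skips auto-detection for the first two domain levels and uses 1 msec and 2 msecs verbatim. Note that migration_factor has no effect on such explicit overrides: the factor is applied inside the auto-detection path (measure_migration_cost()), which is bypassed for any matrix entry already set on the command line.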
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index b3c2e2c26743..a9bf5f222e47 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1096,6 +1096,7 @@ static void smp_tune_scheduling (void)
                        cachesize = 16; /* Pentiums, 2x8kB cache */
                        bandwidth = 100;
                }
+               max_cache_size = cachesize * 1024;
        }
 }
 
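To make the effect of this hook concrete: the value seeded here bounds the working-set search performed later by measure_migration_cost() in kernel/sched.c. A small sketch of that clamping (a paraphrase using the patch's constants, with the kernel's max() macro), applied to the 2x8kB-cache Pentium case above:

        unsigned int cache = 16 * 1024; /* max_cache_size on a Pentium */
        unsigned int hi = max(cache * SEARCH_SCOPE, MIN_CACHE_SIZE);  /* 64K */
        unsigned int lo = max(cache / SEARCH_SCOPE, MIN_CACHE_SIZE);  /* 64K */
        /* hi == lo == 65536: such a box measures a single 64K working set */

With SEARCH_SCOPE=2 and MIN_CACHE_SIZE=64K, any cache of 32K or less clamps both bounds to 64K, so the search loop runs exactly once.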
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index d91c8ff2c0d7..0daa8fa9ef32 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -696,6 +696,7 @@ static void
 get_max_cacheline_size (void)
 {
        unsigned long line_size, max = 1;
+       unsigned int cache_size = 0;
        u64 l, levels, unique_caches;
        pal_cache_config_info_t cci;
        s64 status;
@@ -725,6 +726,8 @@ get_max_cacheline_size (void)
                line_size = 1 << cci.pcci_line_size;
                if (line_size > max)
                        max = line_size;
+               if (cache_size < cci.pcci_cache_size)
+                       cache_size = cci.pcci_cache_size;
                if (!cci.pcci_unified) {
                        status = ia64_pal_cache_config_info(l,
                                    /* cache_type (instruction)= */ 1,
@@ -741,6 +744,9 @@ get_max_cacheline_size (void)
                ia64_i_cache_stride_shift = cci.pcci_stride;
        }
   out:
+#ifdef CONFIG_SMP
+       max_cache_size = max(max_cache_size, cache_size);
+#endif
        if (max > ia64_max_cacheline_size)
                ia64_max_cacheline_size = max;
 }
diff --git a/arch/ppc/xmon/xmon.c b/arch/ppc/xmon/xmon.c
index 2b483b4f1602..9075a7538e26 100644
--- a/arch/ppc/xmon/xmon.c
+++ b/arch/ppc/xmon/xmon.c
@@ -99,7 +99,7 @@ static void remove_bpts(void);
 static void insert_bpts(void);
 static struct bpt *at_breakpoint(unsigned pc);
 static void bpt_cmds(void);
-static void cacheflush(void);
+void cacheflush(void);
 #ifdef CONFIG_SMP
 static void cpu_cmd(void);
 #endif /* CONFIG_SMP */
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 0ec27c9e8e45..d7e19eb344b7 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -72,7 +72,6 @@ static inline int node_to_first_cpu(int node)
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
        .cache_nice_tries       = 1,                    \
        .busy_idx               = 3,                    \
        .idle_idx               = 1,                    \
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index f7c330467e7e..d8aae4da3978 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -55,7 +55,6 @@ void build_cpu_to_node_map(void);
        .max_interval           = 4,                    \
        .busy_factor            = 64,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
        .per_cpu_gain           = 100,                  \
        .cache_nice_tries       = 2,                    \
        .busy_idx               = 2,                    \
@@ -81,7 +80,6 @@ void build_cpu_to_node_map(void);
        .max_interval           = 8*(min(num_online_cpus(), 32)), \
        .busy_factor            = 64,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
        .cache_nice_tries       = 2,                    \
        .busy_idx               = 3,                    \
        .idle_idx               = 2,                    \
diff --git a/include/asm-mips/mach-ip27/topology.h b/include/asm-mips/mach-ip27/topology.h
index 82141c711c33..59d26b52ba32 100644
--- a/include/asm-mips/mach-ip27/topology.h
+++ b/include/asm-mips/mach-ip27/topology.h
@@ -27,7 +27,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000),            \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 9f3d4da261c4..1e19cd00af25 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -39,7 +39,6 @@ static inline int node_to_first_cpu(int node)
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .busy_idx               = 3,                    \
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 7d82bc56b9fa..2fa7f27381b4 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,6 @@ extern int __node_distance(int, int);
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
        .cache_nice_tries       = 2,                    \
        .busy_idx               = 3,                    \
        .idle_idx               = 2,                    \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b74c4bf2934..5d6b9228bba9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -631,7 +631,14 @@ struct sched_domain {
 
 extern void partition_sched_domains(cpumask_t *partition1,
                                    cpumask_t *partition2);
-#endif /* CONFIG_SMP */
+
+/*
+ * Maximum cache size the migration-costs auto-tuning code will
+ * search from:
+ */
+extern unsigned int max_cache_size;
+
+#endif /* CONFIG_SMP */
 
 
 struct io_context;                     /* See blkdev.h */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 3df1d474e5c5..315a5163d6a0 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -86,7 +86,6 @@
        .max_interval           = 2,                    \
        .busy_factor            = 8,                    \
        .imbalance_pct          = 110,                  \
-       .cache_hot_time         = 0,                    \
        .cache_nice_tries       = 0,                    \
        .per_cpu_gain           = 25,                   \
        .busy_idx               = 0,                    \
@@ -117,7 +116,6 @@
        .max_interval           = 4,                    \
        .busy_factor            = 64,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (5*1000000/2),        \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .busy_idx               = 2,                    \
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d5e..98461de1ab65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -5082,7 +5083,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ *  1) the source CPU reads+dirties a shared buffer
+ *  2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the (cost1+cost2) - (cost3+cost4) difference -
+ * this is the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE           2
+#define MIN_CACHE_SIZE         (64*1024U)
+#define DEFAULT_CACHE_SIZE     (5*1024*1024U)
+#define ITERATIONS             2
+#define SIZE_THRESH            130
+#define COST_THRESH            130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+               { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, a level-2 cost of 2 msecs and a level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+       int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+       str = get_options(str, ARRAY_SIZE(ints), ints);
+
+       printk("#ints: %d\n", ints[0]);
+       for (i = 1; i <= ints[0]; i++) {
+               migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+               printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+       }
+       return 1;
+}
+
+__setup("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percent. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+       get_option(&str, &migration_factor);
+       migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+       return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+       unsigned long distance = 0;
+       struct sched_domain *sd;
+
+       for_each_domain(cpu1, sd) {
+               WARN_ON(!cpu_isset(cpu1, sd->span));
+               if (cpu_isset(cpu2, sd->span))
+                       return distance;
+               distance++;
+       }
+       if (distance >= MAX_DOMAIN_DISTANCE) {
+               WARN_ON(1);
+               distance = MAX_DOMAIN_DISTANCE-1;
+       }
+
+       return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+       get_option(&str, &migration_debug);
+       return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+       get_option(&str, &max_cache_size);
+       return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+       unsigned long size = __size/sizeof(long), chunk1 = size/3,
+                       chunk2 = 2*size/3;
+       unsigned long *cache = __cache;
+       int i;
+
+       for (i = 0; i < size/6; i += 8) {
+               switch (i % 6) {
+                       case 0: cache[i]++;
+                       case 1: cache[size-1-i]++;
+                       case 2: cache[chunk1-i]++;
+                       case 3: cache[chunk1+i]++;
+                       case 4: cache[chunk2-i]++;
+                       case 5: cache[chunk2+i]++;
+               }
+       }
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+                                     int source, int target)
+{
+       cpumask_t mask, saved_mask;
+       unsigned long long t0, t1, t2, t3, cost;
+
+       saved_mask = current->cpus_allowed;
+
+       /*
+        * Flush source caches to RAM and invalidate them:
+        */
+       sched_cacheflush();
+
+       /*
+        * Migrate to the source CPU:
+        */
+       mask = cpumask_of_cpu(source);
+       set_cpus_allowed(current, mask);
+       WARN_ON(smp_processor_id() != source);
+
+       /*
+        * Dirty the working set:
+        */
+       t0 = sched_clock();
+       touch_cache(cache, size);
+       t1 = sched_clock();
+
+       /*
+        * Migrate to the target CPU, dirty the L2 cache and access
+        * the shared buffer. (which represents the working set
+        * of a migrated task.)
+        */
+       mask = cpumask_of_cpu(target);
+       set_cpus_allowed(current, mask);
+       WARN_ON(smp_processor_id() != target);
+
+       t2 = sched_clock();
+       touch_cache(cache, size);
+       t3 = sched_clock();
+
+       cost = t1-t0 + t3-t2;
+
+       if (migration_debug >= 2)
+               printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+                       source, target, t1-t0, t2-t1, t3-t2, cost);
+       /*
+        * Flush target caches to RAM and invalidate them:
+        */
+       sched_cacheflush();
+
+       set_cpus_allowed(current, saved_mask);
+
+       return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 10MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+       unsigned long long cost1, cost2;
+       int i;
+
+       /*
+        * Measure the migration cost of 'size' bytes, averaged over
+        * ITERATIONS runs in each of the four source/target pairings:
+        *
+        * (We perturb the working-set size by a small per-iteration
+        * value to compensate for size/alignment related artifacts.
+        * We also subtract the cost of the operation done on
+        * the same CPU.)
+        */
+       cost1 = 0;
+
+       /*
+        * dry run, to make sure we start off cache-cold on cpu1,
+        * and to get any vmalloc pagefaults in advance:
+        */
+       measure_one(cache, size, cpu1, cpu2);
+       for (i = 0; i < ITERATIONS; i++)
+               cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+       measure_one(cache, size, cpu2, cpu1);
+       for (i = 0; i < ITERATIONS; i++)
+               cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+       /*
+        * (We measure the non-migrating [cached] cost on both
+        * cpu1 and cpu2, to handle CPUs with different speeds)
+        */
+       cost2 = 0;
+
+       measure_one(cache, size, cpu1, cpu1);
+       for (i = 0; i < ITERATIONS; i++)
+               cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+       measure_one(cache, size, cpu2, cpu2);
+       for (i = 0; i < ITERATIONS; i++)
+               cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+       /*
+        * Get the per-iteration migration cost:
+        */
+       do_div(cost1, 2*ITERATIONS);
+       do_div(cost2, 2*ITERATIONS);
+
+       return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+       unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+       unsigned int max_size, size, size_found = 0;
+       long long cost = 0, prev_cost;
+       void *cache;
+
+       /*
+        * Search from max_cache_size*SEARCH_SCOPE down to 64K - the
+        * real relevant cachesize has to lie somewhere in between.
+        */
+       if (max_cache_size) {
+               max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+               size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+       } else {
+               /*
+                * Since we have no estimate of the relevant search
+                * range, fall back to the default:
+                */
+               max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+               size = MIN_CACHE_SIZE;
+       }
+
+       if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+               printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+               return 0;
+       }
+
+       /*
+        * Allocate the working set:
+        */
+       cache = vmalloc(max_size);
+       if (!cache) {
+               printk("could not vmalloc %d bytes for cache!\n", max_size);
+               return 1000000; // return 1 msec on very small boxen
+       }
+
+       while (size <= max_size) {
+               prev_cost = cost;
+               cost = measure_cost(cpu1, cpu2, cache, size);
+
+               /*
+                * Update the max:
+                */
+               if (cost > 0) {
+                       if (max_cost < cost) {
+                               max_cost = cost;
+                               size_found = size;
+                       }
+               }
+               /*
+                * Calculate average fluctuation, we use this to prevent
+                * noise from triggering an early break out of the loop:
+                */
+               fluct = abs(cost - prev_cost);
+               avg_fluct = (avg_fluct + fluct)/2;
+
+               if (migration_debug)
+                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                               cpu1, cpu2, size,
+                               (long)cost / 1000000,
+                               ((long)cost / 100000) % 10,
+                               (long)max_cost / 1000000,
+                               ((long)max_cost / 100000) % 10,
+                               domain_distance(cpu1, cpu2),
+                               cost, avg_fluct);
+
+               /*
+                * If we iterated at least 20% past the previous maximum,
+                * and the cost has dropped by more than 20% already
+                * (taking fluctuations into account), then we assume to
+                * have found the maximum and break out of the loop early:
+                */
+               if (size_found && (size*100 > size_found*SIZE_THRESH))
+                       if (cost+avg_fluct <= 0 ||
+                               max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+                               if (migration_debug)
+                                       printk("-> found max.\n");
+                               break;
+                       }
+               /*
+                * Increase the cachesize in ~5% steps:
+                */
+               size = size * 20 / 19;
+       }
+
+       if (migration_debug)
+               printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+                       cpu1, cpu2, size_found, max_cost);
+
+       vfree(cache);
+
+       /*
+        * A task is considered 'cache cold' if at least 2 times
+        * the worst-case cost of migration has passed.
+        *
+        * (this limit is only listened to if the load-balancing
+        * situation is 'nice' - if there is a large imbalance we
+        * ignore it for the sake of CPU utilization and
+        * processing fairness.)
+        */
+       return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+       int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+       unsigned long j0, j1, distance, max_distance = 0;
+       struct sched_domain *sd;
+
+       j0 = jiffies;
+
+       /*
+        * First pass - calculate the cacheflush times:
+        */
+       for_each_cpu_mask(cpu1, *cpu_map) {
+               for_each_cpu_mask(cpu2, *cpu_map) {
+                       if (cpu1 == cpu2)
+                               continue;
+                       distance = domain_distance(cpu1, cpu2);
+                       max_distance = max(max_distance, distance);
+                       /*
+                        * No result cached yet?
+                        */
+                       if (migration_cost[distance] == -1LL)
+                               migration_cost[distance] =
+                                       measure_migration_cost(cpu1, cpu2);
+               }
+       }
+       /*
+        * Second pass - update the sched domain hierarchy with
+        * the new cache-hot-time estimations:
+        */
+       for_each_cpu_mask(cpu, *cpu_map) {
+               distance = 0;
+               for_each_domain(cpu, sd) {
+                       sd->cache_hot_time = migration_cost[distance];
+                       distance++;
+               }
+       }
+       /*
+        * Print the matrix:
+        */
+       if (migration_debug)
+               printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+                       max_cache_size,
+#ifdef CONFIG_X86
+                       cpu_khz/1000
+#else
+                       -1
+#endif
+               );
+       printk("migration_cost=");
+       for (distance = 0; distance <= max_distance; distance++) {
+               if (distance)
+                       printk(",");
+               printk("%ld", (long)migration_cost[distance] / 1000);
+       }
+       printk("\n");
+       j1 = jiffies;
+       if (migration_debug)
+               printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+       /*
+        * Move back to the original CPU. NUMA-Q gets confused
+        * if we migrate to another quad during bootup.
+        */
+       if (raw_smp_processor_id() != orig_cpu) {
+               cpumask_t mask = cpumask_of_cpu(orig_cpu),
+                       saved_mask = current->cpus_allowed;
+
+               set_cpus_allowed(current, mask);
+               set_cpus_allowed(current, saved_mask);
+       }
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5448,6 +5912,10 @@ next_sg:
 #endif
                cpu_attach_domain(sd, i);
        }
+       /*
+        * Tune cache-hot values:
+        */
+       calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
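To experiment with the measurement idea outside the kernel, here is a rough userspace analogue (a sketch only, not code from the patch: it substitutes sched_setaffinity()/clock_gettime() for set_cpus_allowed()/sched_clock(), has no equivalent of sched_cacheflush(), hardcodes CPUs 0 and 1, and elides error handling):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <time.h>

        static unsigned long long now_ns(void)
        {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);
                return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
        }

        static void pin_to(int cpu)
        {
                cpu_set_t set;

                CPU_ZERO(&set);
                CPU_SET(cpu, &set);
                sched_setaffinity(0, sizeof(set), &set);
        }

        /* Dirty the whole buffer (crude stand-in for touch_cache()). */
        static void touch(long *buf, unsigned long words)
        {
                unsigned long i;

                for (i = 0; i < words; i++)
                        buf[i]++;
        }

        /* One source->target sample in nsecs, mirroring measure_one(). */
        static unsigned long long sample(long *buf, unsigned long words,
                                         int source, int target)
        {
                unsigned long long t0, t1, t2, t3;

                pin_to(source);
                t0 = now_ns();
                touch(buf, words);
                t1 = now_ns();

                pin_to(target);
                t2 = now_ns();
                touch(buf, words);
                t3 = now_ns();

                return (t1 - t0) + (t3 - t2);
        }

        int main(void)
        {
                unsigned long size, size_found = 0, max_size = 10*1024*1024;
                unsigned long long max_cost = 0;

                /* ~5% size steps, as in measure_migration_cost() */
                for (size = 64*1024; size <= max_size; size = size * 20/19) {
                        unsigned long words = size / sizeof(long);
                        long *buf = calloc(words, sizeof(long));
                        long long cost;

                        /* cross-CPU cost in both directions, minus the
                         * cache-hot same-CPU cost on both CPUs: */
                        cost = (long long)(sample(buf, words, 0, 1) +
                                           sample(buf, words, 1, 0)) -
                               (long long)(sample(buf, words, 0, 0) +
                                           sample(buf, words, 1, 1));
                        if (cost > 0 && (unsigned long long)cost > max_cost) {
                                max_cost = cost;
                                size_found = size;
                        }
                        free(buf);
                }
                printf("max cost: %llu nsecs at %lu bytes\n",
                       max_cost, size_found);
                return 0;
        }

A real run would also want the dry-run warm-up, the per-iteration size perturbation and the fluctuation-damped early break from measure_cost()/measure_migration_cost(); the single pass above merely illustrates why the cost curve peaks near the effective cache size.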