Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c | 481 ----
1 file changed, 0 insertions(+), 481 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 
 #define SD_NODES_PER_DOMAIN 16
 
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- *  1) the source CPU reads+dirties a shared buffer
- *  2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- *  - source: CPU1, target: CPU2 | cost1
- *  - source: CPU2, target: CPU1 | cost2
- *  - source: CPU1, target: CPU1 | cost3
- *  - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost1+cost2-cost3-cost4 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in roughly 10% steps - measuring each buffer-size
- * separately, and doing a maximum search for the cost. (The maximum
- * cost for a migration normally occurs when the working set size is
- * around the effective cache size.)
- */
-#define SEARCH_SCOPE		2
-#define MIN_CACHE_SIZE		(64*1024U)
-#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
-#define ITERATIONS		1
-#define SIZE_THRESH		130
-#define COST_THRESH		130
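
A worked example of the formula above, with purely illustrative numbers: suppose touching the buffer takes cost1 = 900 us and cost2 = 880 us in the two cross-CPU scenarios, but only cost3 = 400 us and cost4 = 390 us when source and target are the same CPU. The estimated migration cost for that buffer size is then (900 + 880) - (400 + 390) = 990 us: the extra time a freshly migrated task would spend re-warming the target CPU's cache.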
5831 | |||
5832 | /* | ||
5833 | * The migration cost is a function of 'domain distance'. Domain | ||
5834 | * distance is the number of steps a CPU has to iterate down its | ||
5835 | * domain tree to share a domain with the other CPU. The farther | ||
5836 | * two CPUs are from each other, the larger the distance gets. | ||
5837 | * | ||
5838 | * Note that we use the distance only to cache measurement results, | ||
5839 | * the distance value is not used numerically otherwise. When two | ||
5840 | * CPUs have the same distance it is assumed that the migration | ||
5841 | * cost is the same. (this is a simplification but quite practical) | ||
5842 | */ | ||
5843 | #define MAX_DOMAIN_DISTANCE 32 | ||
5844 | |||
5845 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5846 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = | ||
5847 | /* | ||
5848 | * Architectures may override the migration cost and thus avoid | ||
5849 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | ||
5850 | * virtualized hardware: | ||
5851 | */ | ||
5852 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | ||
5853 | CONFIG_DEFAULT_MIGRATION_COST | ||
5854 | #else | ||
5855 | -1LL | ||
5856 | #endif | ||
5857 | }; | ||
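
The [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = initializer above is a GCC range-designator extension that sets every element of the array to the same value. A minimal standalone illustration (builds with gcc; not part of the kernel patch):

#include <stdio.h>

/* Every element from index 0 through 3 is initialized to -1LL: */
static long long cost[4] = { [0 ... 3] = -1LL };

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("cost[%d] = %lld\n", i, cost[i]);
	return 0;
}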
5858 | |||
5859 | /* | ||
5860 | * Allow override of migration cost - in units of microseconds. | ||
5861 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5862 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
5863 | */ | ||
5864 | static int __init migration_cost_setup(char *str) | ||
5865 | { | ||
5866 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5867 | |||
5868 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5869 | |||
5870 | printk("#ints: %d\n", ints[0]); | ||
5871 | for (i = 1; i <= ints[0]; i++) { | ||
5872 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5873 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5874 | } | ||
5875 | return 1; | ||
5876 | } | ||
5877 | |||
5878 | __setup ("migration_cost=", migration_cost_setup); | ||
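
For example (hypothetical values): booting with migration_cost=4000,12000 makes get_options() store the count in ints[0] (here 2) and the values in ints[1..2], so the loop sets migration_cost[0] = 4,000,000 ns and migration_cost[1] = 12,000,000 ns - a 4 msec level-1 and a 12 msec level-2 cache-hot cutoff, and calibration is skipped for those levels.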
5879 | |||
5880 | /* | ||
5881 | * Global multiplier (divisor) for migration-cutoff values, | ||
5882 | * in percentiles. E.g. use a value of 150 to get 1.5 times | ||
5883 | * longer cache-hot cutoff times. | ||
5884 | * | ||
5885 | * (We scale it from 100 to 128 to long long handling easier.) | ||
5886 | */ | ||
5887 | |||
5888 | #define MIGRATION_FACTOR_SCALE 128 | ||
5889 | |||
5890 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5891 | |||
5892 | static int __init setup_migration_factor(char *str) | ||
5893 | { | ||
5894 | get_option(&str, &migration_factor); | ||
5895 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5896 | return 1; | ||
5897 | } | ||
5898 | |||
5899 | __setup("migration_factor=", setup_migration_factor); | ||
5900 | |||
5901 | /* | ||
5902 | * Estimated distance of two CPUs, measured via the number of domains | ||
5903 | * we have to pass for the two CPUs to be in the same span: | ||
5904 | */ | ||
5905 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5906 | { | ||
5907 | unsigned long distance = 0; | ||
5908 | struct sched_domain *sd; | ||
5909 | |||
5910 | for_each_domain(cpu1, sd) { | ||
5911 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5912 | if (cpu_isset(cpu2, sd->span)) | ||
5913 | return distance; | ||
5914 | distance++; | ||
5915 | } | ||
5916 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5917 | WARN_ON(1); | ||
5918 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5919 | } | ||
5920 | |||
5921 | return distance; | ||
5922 | } | ||
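
As an illustration, take a hypothetical two-socket machine with SMT, where each CPU has a per-core domain below a per-socket domain below a top-level domain: two hardware threads of the same core already share the lowest span, so their distance is 0; two cores in the same socket first meet in the socket-level span, distance 1; CPUs in different sockets only meet at the top level, distance 2. Each distinct distance gets one cached slot in migration_cost[].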
5923 | |||
5924 | static unsigned int migration_debug; | ||
5925 | |||
5926 | static int __init setup_migration_debug(char *str) | ||
5927 | { | ||
5928 | get_option(&str, &migration_debug); | ||
5929 | return 1; | ||
5930 | } | ||
5931 | |||
5932 | __setup("migration_debug=", setup_migration_debug); | ||
5933 | |||
5934 | /* | ||
5935 | * Maximum cache-size that the scheduler should try to measure. | ||
5936 | * Architectures with larger caches should tune this up during | ||
5937 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5938 | * bootup). | ||
5939 | */ | ||
5940 | unsigned int max_cache_size; | ||
5941 | |||
5942 | static int __init setup_max_cache_size(char *str) | ||
5943 | { | ||
5944 | get_option(&str, &max_cache_size); | ||
5945 | return 1; | ||
5946 | } | ||
5947 | |||
5948 | __setup("max_cache_size=", setup_max_cache_size); | ||
5949 | |||
5950 | /* | ||
5951 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5952 | * is the operation that is timed, so we try to generate unpredictable | ||
5953 | * cachemisses that still end up filling the L2 cache: | ||
5954 | */ | ||
5955 | static void touch_cache(void *__cache, unsigned long __size) | ||
5956 | { | ||
5957 | unsigned long size = __size / sizeof(long); | ||
5958 | unsigned long chunk1 = size / 3; | ||
5959 | unsigned long chunk2 = 2 * size / 3; | ||
5960 | unsigned long *cache = __cache; | ||
5961 | int i; | ||
5962 | |||
5963 | for (i = 0; i < size/6; i += 8) { | ||
5964 | switch (i % 6) { | ||
5965 | case 0: cache[i]++; | ||
5966 | case 1: cache[size-1-i]++; | ||
5967 | case 2: cache[chunk1-i]++; | ||
5968 | case 3: cache[chunk1+i]++; | ||
5969 | case 4: cache[chunk2-i]++; | ||
5970 | case 5: cache[chunk2+i]++; | ||
5971 | } | ||
5972 | } | ||
5973 | } | ||
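
A reading note on touch_cache(), since the control flow is easy to misparse: i advances in steps of 8, so i % 6 only ever evaluates to 0, 2 or 4, and because the switch contains no break statements, every case below the matching label runs as well. A hit on case 0 therefore increments all six locations, case 2 the last four, and case 4 the last two - a scattered forward/backward/middle access pattern that defeats simple hardware prefetching while still filling the cache.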
5974 | |||
5975 | /* | ||
5976 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5977 | */ | ||
5978 | static unsigned long long | ||
5979 | measure_one(void *cache, unsigned long size, int source, int target) | ||
5980 | { | ||
5981 | cpumask_t mask, saved_mask; | ||
5982 | unsigned long long t0, t1, t2, t3, cost; | ||
5983 | |||
5984 | saved_mask = current->cpus_allowed; | ||
5985 | |||
5986 | /* | ||
5987 | * Flush source caches to RAM and invalidate them: | ||
5988 | */ | ||
5989 | sched_cacheflush(); | ||
5990 | |||
5991 | /* | ||
5992 | * Migrate to the source CPU: | ||
5993 | */ | ||
5994 | mask = cpumask_of_cpu(source); | ||
5995 | set_cpus_allowed(current, mask); | ||
5996 | WARN_ON(smp_processor_id() != source); | ||
5997 | |||
5998 | /* | ||
5999 | * Dirty the working set: | ||
6000 | */ | ||
6001 | t0 = sched_clock(); | ||
6002 | touch_cache(cache, size); | ||
6003 | t1 = sched_clock(); | ||
6004 | |||
6005 | /* | ||
6006 | * Migrate to the target CPU, dirty the L2 cache and access | ||
6007 | * the shared buffer. (which represents the working set | ||
6008 | * of a migrated task.) | ||
6009 | */ | ||
6010 | mask = cpumask_of_cpu(target); | ||
6011 | set_cpus_allowed(current, mask); | ||
6012 | WARN_ON(smp_processor_id() != target); | ||
6013 | |||
6014 | t2 = sched_clock(); | ||
6015 | touch_cache(cache, size); | ||
6016 | t3 = sched_clock(); | ||
6017 | |||
6018 | cost = t1-t0 + t3-t2; | ||
6019 | |||
6020 | if (migration_debug >= 2) | ||
6021 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
6022 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
6023 | /* | ||
6024 | * Flush target caches to RAM and invalidate them: | ||
6025 | */ | ||
6026 | sched_cacheflush(); | ||
6027 | |||
6028 | set_cpus_allowed(current, saved_mask); | ||
6029 | |||
6030 | return cost; | ||
6031 | } | ||
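
For readers who want to reproduce the idea outside the kernel, the same measurement pattern can be sketched in userspace with sched_setaffinity() and clock_gettime(). This is a simplified analogue, not the patch's code: the CPU numbers, buffer size and linear access pattern are arbitrary choices, and error handling is omitted.

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pin the calling thread to one CPU - the userspace counterpart of
 * set_cpus_allowed() in measure_one() above. */
static void pin_to(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
}

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Dirty the buffer on 'source', migrate, dirty it again on 'target',
 * and return the summed touch times - the shape of measure_one(). */
static uint64_t measure_one_user(long *buf, size_t words, int source, int target)
{
	uint64_t t0, t1, t2, t3;
	size_t i;

	pin_to(source);
	t0 = now_ns();
	for (i = 0; i < words; i++)
		buf[i]++;
	t1 = now_ns();

	pin_to(target);
	t2 = now_ns();
	for (i = 0; i < words; i++)
		buf[i]++;
	t3 = now_ns();

	return (t1 - t0) + (t3 - t2);
}

int main(void)
{
	size_t words = (4 * 1024 * 1024) / sizeof(long);	/* 4 MB working set */
	long *buf = calloc(words, sizeof(long));

	if (!buf)
		return 1;
	printf("cross-CPU cost: %llu ns\n",
	       (unsigned long long)measure_one_user(buf, words, 0, 1));
	printf("same-CPU cost:  %llu ns\n",
	       (unsigned long long)measure_one_user(buf, words, 0, 0));
	free(buf);
	return 0;
}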
6032 | |||
6033 | /* | ||
6034 | * Measure a series of task migrations and return the average | ||
6035 | * result. Since this code runs early during bootup the system | ||
6036 | * is 'undisturbed' and the average latency makes sense. | ||
6037 | * | ||
6038 | * The algorithm in essence auto-detects the relevant cache-size, | ||
6039 | * so it will properly detect different cachesizes for different | ||
6040 | * cache-hierarchies, depending on how the CPUs are connected. | ||
6041 | * | ||
6042 | * Architectures can prime the upper limit of the search range via | ||
6043 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | ||
6044 | */ | ||
6045 | static unsigned long long | ||
6046 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
6047 | { | ||
6048 | unsigned long long cost1, cost2; | ||
6049 | int i; | ||
6050 | |||
6051 | /* | ||
6052 | * Measure the migration cost of 'size' bytes, over an | ||
6053 | * average of 10 runs: | ||
6054 | * | ||
6055 | * (We perturb the cache size by a small (0..4k) | ||
6056 | * value to compensate size/alignment related artifacts. | ||
6057 | * We also subtract the cost of the operation done on | ||
6058 | * the same CPU.) | ||
6059 | */ | ||
6060 | cost1 = 0; | ||
6061 | |||
6062 | /* | ||
6063 | * dry run, to make sure we start off cache-cold on cpu1, | ||
6064 | * and to get any vmalloc pagefaults in advance: | ||
6065 | */ | ||
6066 | measure_one(cache, size, cpu1, cpu2); | ||
6067 | for (i = 0; i < ITERATIONS; i++) | ||
6068 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); | ||
6069 | |||
6070 | measure_one(cache, size, cpu2, cpu1); | ||
6071 | for (i = 0; i < ITERATIONS; i++) | ||
6072 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); | ||
6073 | |||
6074 | /* | ||
6075 | * (We measure the non-migrating [cached] cost on both | ||
6076 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
6077 | */ | ||
6078 | cost2 = 0; | ||
6079 | |||
6080 | measure_one(cache, size, cpu1, cpu1); | ||
6081 | for (i = 0; i < ITERATIONS; i++) | ||
6082 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); | ||
6083 | |||
6084 | measure_one(cache, size, cpu2, cpu2); | ||
6085 | for (i = 0; i < ITERATIONS; i++) | ||
6086 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); | ||
6087 | |||
6088 | /* | ||
6089 | * Get the per-iteration migration cost: | ||
6090 | */ | ||
6091 | do_div(cost1, 2 * ITERATIONS); | ||
6092 | do_div(cost2, 2 * ITERATIONS); | ||
6093 | |||
6094 | return cost1 - cost2; | ||
6095 | } | ||
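
Numerically (illustrative figures): with ITERATIONS = 1, cost1 is the sum of one cpu1->cpu2 and one cpu2->cpu1 run, say 950,000 + 970,000 = 1,920,000 ns, and cost2 the sum of the two same-CPU runs, say 800,000 + 820,000 = 1,620,000 ns. After do_div() by 2*ITERATIONS both become per-run averages (960,000 and 810,000 ns) and the function returns the 150,000 ns difference - the cache-refill penalty attributable to migration at this buffer size.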
6096 | |||
6097 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
6098 | { | ||
6099 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
6100 | unsigned int max_size, size, size_found = 0; | ||
6101 | long long cost = 0, prev_cost; | ||
6102 | void *cache; | ||
6103 | |||
6104 | /* | ||
6105 | * Search from max_cache_size*5 down to 64K - the real relevant | ||
6106 | * cachesize has to lie somewhere inbetween. | ||
6107 | */ | ||
6108 | if (max_cache_size) { | ||
6109 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6110 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6111 | } else { | ||
6112 | /* | ||
6113 | * Since we have no estimation about the relevant | ||
6114 | * search range | ||
6115 | */ | ||
6116 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
6117 | size = MIN_CACHE_SIZE; | ||
6118 | } | ||
6119 | |||
6120 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
6121 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
6122 | return 0; | ||
6123 | } | ||
6124 | |||
6125 | /* | ||
6126 | * Allocate the working set: | ||
6127 | */ | ||
6128 | cache = vmalloc(max_size); | ||
6129 | if (!cache) { | ||
6130 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); | ||
6131 | return 1000000; /* return 1 msec on very small boxen */ | ||
6132 | } | ||
6133 | |||
6134 | while (size <= max_size) { | ||
6135 | prev_cost = cost; | ||
6136 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
6137 | |||
6138 | /* | ||
6139 | * Update the max: | ||
6140 | */ | ||
6141 | if (cost > 0) { | ||
6142 | if (max_cost < cost) { | ||
6143 | max_cost = cost; | ||
6144 | size_found = size; | ||
6145 | } | ||
6146 | } | ||
6147 | /* | ||
6148 | * Calculate average fluctuation, we use this to prevent | ||
6149 | * noise from triggering an early break out of the loop: | ||
6150 | */ | ||
6151 | fluct = abs(cost - prev_cost); | ||
6152 | avg_fluct = (avg_fluct + fluct)/2; | ||
6153 | |||
6154 | if (migration_debug) | ||
6155 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " | ||
6156 | "(%8Ld %8Ld)\n", | ||
6157 | cpu1, cpu2, size, | ||
6158 | (long)cost / 1000000, | ||
6159 | ((long)cost / 100000) % 10, | ||
6160 | (long)max_cost / 1000000, | ||
6161 | ((long)max_cost / 100000) % 10, | ||
6162 | domain_distance(cpu1, cpu2), | ||
6163 | cost, avg_fluct); | ||
6164 | |||
6165 | /* | ||
6166 | * If we iterated at least 20% past the previous maximum, | ||
6167 | * and the cost has dropped by more than 20% already, | ||
6168 | * (taking fluctuations into account) then we assume to | ||
6169 | * have found the maximum and break out of the loop early: | ||
6170 | */ | ||
6171 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
6172 | if (cost+avg_fluct <= 0 || | ||
6173 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
6174 | |||
6175 | if (migration_debug) | ||
6176 | printk("-> found max.\n"); | ||
6177 | break; | ||
6178 | } | ||
6179 | /* | ||
6180 | * Increase the cachesize in 10% steps: | ||
6181 | */ | ||
6182 | size = size * 10 / 9; | ||
6183 | } | ||
6184 | |||
6185 | if (migration_debug) | ||
6186 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
6187 | cpu1, cpu2, size_found, max_cost); | ||
6188 | |||
6189 | vfree(cache); | ||
6190 | |||
6191 | /* | ||
6192 | * A task is considered 'cache cold' if at least 2 times | ||
6193 | * the worst-case cost of migration has passed. | ||
6194 | * | ||
6195 | * (this limit is only listened to if the load-balancing | ||
6196 | * situation is 'nice' - if there is a large imbalance we | ||
6197 | * ignore it for the sake of CPU utilization and | ||
6198 | * processing fairness.) | ||
6199 | */ | ||
6200 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
6201 | } | ||
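
So if the maximum measured per-run penalty is, say, 3,000,000 ns, the returned cache-hot cutoff with the default migration_factor (128) is 2 * 3,000,000 * 128 / 128 = 6,000,000 ns = 6 msec; booting with migration_factor=150 (stored internally as 192) stretches that to 9 msec.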
6202 | |||
6203 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
6204 | { | ||
6205 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
6206 | unsigned long j0, j1, distance, max_distance = 0; | ||
6207 | struct sched_domain *sd; | ||
6208 | |||
6209 | j0 = jiffies; | ||
6210 | |||
6211 | /* | ||
6212 | * First pass - calculate the cacheflush times: | ||
6213 | */ | ||
6214 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
6215 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
6216 | if (cpu1 == cpu2) | ||
6217 | continue; | ||
6218 | distance = domain_distance(cpu1, cpu2); | ||
6219 | max_distance = max(max_distance, distance); | ||
6220 | /* | ||
6221 | * No result cached yet? | ||
6222 | */ | ||
6223 | if (migration_cost[distance] == -1LL) | ||
6224 | migration_cost[distance] = | ||
6225 | measure_migration_cost(cpu1, cpu2); | ||
6226 | } | ||
6227 | } | ||
6228 | /* | ||
6229 | * Second pass - update the sched domain hierarchy with | ||
6230 | * the new cache-hot-time estimations: | ||
6231 | */ | ||
6232 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6233 | distance = 0; | ||
6234 | for_each_domain(cpu, sd) { | ||
6235 | sd->cache_hot_time = migration_cost[distance]; | ||
6236 | distance++; | ||
6237 | } | ||
6238 | } | ||
6239 | /* | ||
6240 | * Print the matrix: | ||
6241 | */ | ||
6242 | if (migration_debug) | ||
6243 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
6244 | max_cache_size, | ||
6245 | #ifdef CONFIG_X86 | ||
6246 | cpu_khz/1000 | ||
6247 | #else | ||
6248 | -1 | ||
6249 | #endif | ||
6250 | ); | ||
6251 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { | ||
6252 | printk("migration_cost="); | ||
6253 | for (distance = 0; distance <= max_distance; distance++) { | ||
6254 | if (distance) | ||
6255 | printk(","); | ||
6256 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
6257 | } | ||
6258 | printk("\n"); | ||
6259 | } | ||
6260 | j1 = jiffies; | ||
6261 | if (migration_debug) | ||
6262 | printk("migration: %ld seconds\n", (j1-j0) / HZ); | ||
6263 | |||
6264 | /* | ||
6265 | * Move back to the original CPU. NUMA-Q gets confused | ||
6266 | * if we migrate to another quad during bootup. | ||
6267 | */ | ||
6268 | if (raw_smp_processor_id() != orig_cpu) { | ||
6269 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
6270 | saved_mask = current->cpus_allowed; | ||
6271 | |||
6272 | set_cpus_allowed(current, mask); | ||
6273 | set_cpus_allowed(current, saved_mask); | ||
6274 | } | ||
6275 | } | ||
6276 | |||
 #ifdef CONFIG_NUMA
 
 /**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 		cpu_attach_domain(sd, i);
 	}
-	/*
-	 * Tune cache-hot values:
-	 */
-	calibrate_migration_costs(cpu_map);
 
 	return 0;
 