-rw-r--r--  Documentation/kernel-parameters.txt |  43 ---
-rw-r--r--  arch/i386/kernel/smpboot.c          |  12 --
-rw-r--r--  arch/ia64/kernel/setup.c            |   6 --
-rw-r--r--  arch/mips/kernel/smp.c              |  11 --
-rw-r--r--  arch/sparc/kernel/smp.c             |  10 --
-rw-r--r--  arch/sparc64/kernel/smp.c           |  27 --
-rw-r--r--  include/linux/sched.h               |   6 --
-rw-r--r--  kernel/sched.c                      | 481 ------
8 files changed, 0 insertions(+), 596 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af50f9bbe68e..4d880b3d1f35 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	mga=		[HW,DRM]
 
-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
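
For reference, all three of the options removed above were plain boot
parameters, so a hypothetical command line exercising them on a two-level
box might have read (values invented for illustration):

    migration_cost=1000,2000 migration_factor=80 migration_debug=1

With this patch applied, the corresponding __setup() handlers in
kernel/sched.c are gone as well (see below), so such a command line no
longer has any effect.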
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 88baed1e7e83..0b2954534b8e 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ exit:
 }
 #endif
 
-static void smp_tune_scheduling(void)
-{
-	if (cpu_khz) {
-		/* cache size in kB */
-		long cachesize = boot_cpu_data.x86_cache_size;
-
-		if (cachesize > 0)
-			max_cache_size = cachesize * 1024;
-	}
-}
-
 /*
  * Cycle through the processors sending APIC IPIs to boot each.
  */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
 
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 
 	set_cpu_sibling_map(0);
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index eaa6a24bc0b6..188fb73c6845 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
 get_max_cacheline_size (void)
 {
 	unsigned long line_size, max = 1;
-	unsigned int cache_size = 0;
 	u64 l, levels, unique_caches;
 	pal_cache_config_info_t cci;
 	s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
 		line_size = 1 << cci.pcci_line_size;
 		if (line_size > max)
 			max = line_size;
-		if (cache_size < cci.pcci_cache_size)
-			cache_size = cci.pcci_cache_size;
 		if (!cci.pcci_unified) {
 			status = ia64_pal_cache_config_info(l,
 				    /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
 			ia64_i_cache_stride_shift = cci.pcci_stride;
 		}
   out:
-#ifdef CONFIG_SMP
-	max_cache_size = max(max_cache_size, cache_size);
-#endif
 	if (max > ia64_max_cacheline_size)
 		ia64_max_cacheline_size = max;
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 67edfa7ed93a..a1b017f2dbb3 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS];		/* Map logical to physical */
 EXPORT_SYMBOL(phys_cpu_present_map);
 EXPORT_SYMBOL(cpu_online_map);
 
-/* This happens early in bootup, can't really do it better */
-static void smp_tune_scheduling (void)
-{
-	struct cache_desc *cd = &current_cpu_data.scache;
-	unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
-
-	if (cachesize > max_cache_size)
-		max_cache_size = cachesize;
-}
-
 extern void __init calibrate_delay(void);
 extern ATTRIB_NORET void cpu_idle(void);
 
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	init_new_context(current, &init_mm);
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 	plat_prepare_cpus(max_cpus);
 #ifndef CONFIG_HOTPLUG_CPU
 	cpu_present_map = cpu_possible_map;
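
The MIPS hunk above sizes the secondary cache from its geometry. Here is a
minimal user-space sketch of the same arithmetic, with made-up numbers
standing in for current_cpu_data.scache (the real values come from the
probed cache_desc):

#include <stdio.h>

int main(void)
{
	unsigned long linesz = 32;	/* bytes per cache line (assumed) */
	unsigned long sets = 1024;	/* number of sets (assumed) */
	unsigned long ways = 4;		/* associativity (assumed) */

	/* 32 * 1024 * 4 = 131072 bytes, i.e. a 128KB scache */
	unsigned long cachesize = linesz * sets * ways;

	printf("scache = %lu bytes (%lu KB)\n", cachesize, cachesize / 1024);
	return 0;
}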
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index 4d9ad59031bb..4fea3ac7bff0 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	cpu_data(id).prom_node = cpu_node;
 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
 
-	/* this is required to tune the scheduler correctly */
-	/* is it possible to have CPUs with different cache sizes? */
-	if (id == boot_cpu_id) {
-		int cache_line,cache_nlines;
-		cache_line = 0x20;
-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
-		cache_nlines = 0x8000;
-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
-		max_cache_size = cache_line * cache_nlines;
-	}
 	if (cpu_data(id).mid < 0)
 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
 }
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4dcd7d0b60f2..40e40f968d61 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-static void __init smp_tune_scheduling(void)
-{
-	unsigned int smallest = ~0U;
-	int i;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		unsigned int val = cpu_data(i).ecache_size;
-
-		if (val && val < smallest)
-			smallest = val;
-	}
-
-	/* Any value less than 256K is nonsense. */
-	if (smallest < (256U * 1024U))
-		smallest = 256 * 1024;
-
-	max_cache_size = smallest;
-
-	if (smallest < 1U * 1024U * 1024U)
-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
-		       smallest / 1024U);
-	else
-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
-		       smallest / 1024U / 1024U);
-}
-
 /* Constrain the number of cpus to max_cpus. */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	}
 
 	cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
-	smp_tune_scheduling();
 }
 
 void __devinit smp_prepare_boot_cpu(void)
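
The sparc64 variant scans every possible CPU for the smallest E-cache and
clamps the result. A user-space sketch of that min-with-sentinel idiom,
using invented per-CPU sizes in place of cpu_data(i).ecache_size:

#include <stdio.h>

int main(void)
{
	/* hypothetical E-cache sizes; 0 marks an unpopulated CPU slot */
	unsigned int ecache[4] = { 0, 4 << 20, 8 << 20, 4 << 20 };
	unsigned int smallest = ~0U;	/* sentinel: UINT_MAX */
	int i;

	for (i = 0; i < 4; i++)
		if (ecache[i] && ecache[i] < smallest)
			smallest = ecache[i];

	if (smallest < 256U * 1024U)	/* "less than 256K is nonsense" */
		smallest = 256U * 1024U;

	printf("max_cache_size = %uMB\n", smallest >> 20);	/* 4MB here */
	return 0;
}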
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e74262f98e1..8764cda0feca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -754,12 +754,6 @@ struct sched_domain {
 extern int partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
 
-/*
- * Maximum cache size the migration-costs auto-tuning code will
- * search from:
- */
-extern unsigned int max_cache_size;
-
 #endif	/* CONFIG_SMP */
 
 
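
For context, and not part of this patch: max_cache_size only primed the
calibration removed below, whose results were written into
sd->cache_hot_time. Load balancing in kernels of this era consumed that
value through a cache-hotness test of roughly this shape (paraphrased from
2.6.x kernel/sched.c; treat it as a sketch, not a quote):

#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
				< (long long) ((sd)->cache_hot_time))

A task that had run more recently than the calibrated cache-hot time was
skipped by the migration path unless the imbalance was severe.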
diff --git a/kernel/sched.c b/kernel/sched.c
index ac054d9a0719..46b23f0fee24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5797,483 +5797,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 
 #define SD_NODES_PER_DOMAIN 16
 
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- *  1) the source CPU reads+dirties a shared buffer
- *  2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- *  - source: CPU1, target: CPU2 | cost1
- *  - source: CPU2, target: CPU1 | cost2
- *  - source: CPU1, target: CPU1 | cost3
- *  - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost3+cost4-cost1-cost2 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in 5% steps - measuring each buffer-size separately, and
- * doing a maximum search for the cost. (The maximum cost for a migration
- * normally occurs when the working set size is around the effective cache
- * size.)
- */
-#define SEARCH_SCOPE		2
-#define MIN_CACHE_SIZE		(64*1024U)
-#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
-#define ITERATIONS		1
-#define SIZE_THRESH		130
-#define COST_THRESH		130
-
-/*
- * The migration cost is a function of 'domain distance'. Domain
- * distance is the number of steps a CPU has to iterate down its
- * domain tree to share a domain with the other CPU. The farther
- * two CPUs are from each other, the larger the distance gets.
- *
- * Note that we use the distance only to cache measurement results,
- * the distance value is not used numerically otherwise. When two
- * CPUs have the same distance it is assumed that the migration
- * cost is the same. (this is a simplification but quite practical)
- */
-#define MAX_DOMAIN_DISTANCE 32
-
-static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
-/*
- * Architectures may override the migration cost and thus avoid
- * boot-time calibration. Unit is nanoseconds. Mostly useful for
- * virtualized hardware:
- */
-#ifdef CONFIG_DEFAULT_MIGRATION_COST
-			CONFIG_DEFAULT_MIGRATION_COST
-#else
-			-1LL
-#endif
-};
-
-/*
- * Allow override of migration cost - in units of microseconds.
- * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
- * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
- */
-static int __init migration_cost_setup(char *str)
-{
-	int ints[MAX_DOMAIN_DISTANCE+1], i;
-
-	str = get_options(str, ARRAY_SIZE(ints), ints);
-
-	printk("#ints: %d\n", ints[0]);
-	for (i = 1; i <= ints[0]; i++) {
-		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
-		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
-	}
-	return 1;
-}
-
-__setup ("migration_cost=", migration_cost_setup);
-
-/*
- * Global multiplier (divisor) for migration-cutoff values,
- * in percentiles. E.g. use a value of 150 to get 1.5 times
- * longer cache-hot cutoff times.
- *
- * (We scale it from 100 to 128 to long long handling easier.)
- */
-
-#define MIGRATION_FACTOR_SCALE 128
-
-static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
-
-static int __init setup_migration_factor(char *str)
-{
-	get_option(&str, &migration_factor);
-	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
-	return 1;
-}
-
-__setup("migration_factor=", setup_migration_factor);
-
-/*
- * Estimated distance of two CPUs, measured via the number of domains
- * we have to pass for the two CPUs to be in the same span:
- */
-static unsigned long domain_distance(int cpu1, int cpu2)
-{
-	unsigned long distance = 0;
-	struct sched_domain *sd;
-
-	for_each_domain(cpu1, sd) {
-		WARN_ON(!cpu_isset(cpu1, sd->span));
-		if (cpu_isset(cpu2, sd->span))
-			return distance;
-		distance++;
-	}
-	if (distance >= MAX_DOMAIN_DISTANCE) {
-		WARN_ON(1);
-		distance = MAX_DOMAIN_DISTANCE-1;
-	}
-
-	return distance;
-}
-
-static unsigned int migration_debug;
-
-static int __init setup_migration_debug(char *str)
-{
-	get_option(&str, &migration_debug);
-	return 1;
-}
-
-__setup("migration_debug=", setup_migration_debug);
-
-/*
- * Maximum cache-size that the scheduler should try to measure.
- * Architectures with larger caches should tune this up during
- * bootup. Gets used in the domain-setup code (i.e. during SMP
- * bootup).
- */
-unsigned int max_cache_size;
-
-static int __init setup_max_cache_size(char *str)
-{
-	get_option(&str, &max_cache_size);
-	return 1;
-}
-
-__setup("max_cache_size=", setup_max_cache_size);
-
-/*
- * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
- * is the operation that is timed, so we try to generate unpredictable
- * cachemisses that still end up filling the L2 cache:
- */
-static void touch_cache(void *__cache, unsigned long __size)
-{
-	unsigned long size = __size / sizeof(long);
-	unsigned long chunk1 = size / 3;
-	unsigned long chunk2 = 2 * size / 3;
-	unsigned long *cache = __cache;
-	int i;
-
-	for (i = 0; i < size/6; i += 8) {
-		switch (i % 6) {
-			case 0: cache[i]++;
-			case 1: cache[size-1-i]++;
-			case 2: cache[chunk1-i]++;
-			case 3: cache[chunk1+i]++;
-			case 4: cache[chunk2-i]++;
-			case 5: cache[chunk2+i]++;
-		}
-	}
-}
-
-/*
- * Measure the cache-cost of one task migration. Returns in units of nsec.
- */
-static unsigned long long
-measure_one(void *cache, unsigned long size, int source, int target)
-{
-	cpumask_t mask, saved_mask;
-	unsigned long long t0, t1, t2, t3, cost;
-
-	saved_mask = current->cpus_allowed;
-
-	/*
-	 * Flush source caches to RAM and invalidate them:
-	 */
-	sched_cacheflush();
-
-	/*
-	 * Migrate to the source CPU:
-	 */
-	mask = cpumask_of_cpu(source);
-	set_cpus_allowed(current, mask);
-	WARN_ON(smp_processor_id() != source);
-
-	/*
-	 * Dirty the working set:
-	 */
-	t0 = sched_clock();
-	touch_cache(cache, size);
-	t1 = sched_clock();
-
-	/*
-	 * Migrate to the target CPU, dirty the L2 cache and access
-	 * the shared buffer. (which represents the working set
-	 * of a migrated task.)
-	 */
-	mask = cpumask_of_cpu(target);
-	set_cpus_allowed(current, mask);
-	WARN_ON(smp_processor_id() != target);
-
-	t2 = sched_clock();
-	touch_cache(cache, size);
-	t3 = sched_clock();
-
-	cost = t1-t0 + t3-t2;
-
-	if (migration_debug >= 2)
-		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
-			source, target, t1-t0, t1-t0, t3-t2, cost);
-	/*
-	 * Flush target caches to RAM and invalidate them:
-	 */
-	sched_cacheflush();
-
-	set_cpus_allowed(current, saved_mask);
-
-	return cost;
-}
-
-/*
- * Measure a series of task migrations and return the average
- * result. Since this code runs early during bootup the system
- * is 'undisturbed' and the average latency makes sense.
- *
- * The algorithm in essence auto-detects the relevant cache-size,
- * so it will properly detect different cachesizes for different
- * cache-hierarchies, depending on how the CPUs are connected.
- *
- * Architectures can prime the upper limit of the search range via
- * max_cache_size, otherwise the search range defaults to 20MB...64K.
- */
-static unsigned long long
-measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
-{
-	unsigned long long cost1, cost2;
-	int i;
-
-	/*
-	 * Measure the migration cost of 'size' bytes, over an
-	 * average of 10 runs:
-	 *
-	 * (We perturb the cache size by a small (0..4k)
-	 *  value to compensate size/alignment related artifacts.
-	 *  We also subtract the cost of the operation done on
-	 *  the same CPU.)
-	 */
-	cost1 = 0;
-
-	/*
-	 * dry run, to make sure we start off cache-cold on cpu1,
-	 * and to get any vmalloc pagefaults in advance:
-	 */
-	measure_one(cache, size, cpu1, cpu2);
-	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
-
-	measure_one(cache, size, cpu2, cpu1);
-	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
-
-	/*
-	 * (We measure the non-migrating [cached] cost on both
-	 *  cpu1 and cpu2, to handle CPUs with different speeds)
-	 */
-	cost2 = 0;
-
-	measure_one(cache, size, cpu1, cpu1);
-	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
-
-	measure_one(cache, size, cpu2, cpu2);
-	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
-
-	/*
-	 * Get the per-iteration migration cost:
-	 */
-	do_div(cost1, 2 * ITERATIONS);
-	do_div(cost2, 2 * ITERATIONS);
-
-	return cost1 - cost2;
-}
-
-static unsigned long long measure_migration_cost(int cpu1, int cpu2)
-{
-	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
-	unsigned int max_size, size, size_found = 0;
-	long long cost = 0, prev_cost;
-	void *cache;
-
-	/*
-	 * Search from max_cache_size*5 down to 64K - the real relevant
-	 * cachesize has to lie somewhere inbetween.
-	 */
-	if (max_cache_size) {
-		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
-		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
-	} else {
-		/*
-		 * Since we have no estimation about the relevant
-		 * search range
-		 */
-		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
-		size = MIN_CACHE_SIZE;
-	}
-
-	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
-		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
-		return 0;
-	}
-
-	/*
-	 * Allocate the working set:
-	 */
-	cache = vmalloc(max_size);
-	if (!cache) {
-		printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
-		return 1000000;		/* return 1 msec on very small boxen */
-	}
-
-	while (size <= max_size) {
-		prev_cost = cost;
-		cost = measure_cost(cpu1, cpu2, cache, size);
-
-		/*
-		 * Update the max:
-		 */
-		if (cost > 0) {
-			if (max_cost < cost) {
-				max_cost = cost;
-				size_found = size;
-			}
-		}
-		/*
-		 * Calculate average fluctuation, we use this to prevent
-		 * noise from triggering an early break out of the loop:
-		 */
-		fluct = abs(cost - prev_cost);
-		avg_fluct = (avg_fluct + fluct)/2;
-
-		if (migration_debug)
-			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
-				"(%8Ld %8Ld)\n",
-				cpu1, cpu2, size,
-				(long)cost / 1000000,
-				((long)cost / 100000) % 10,
-				(long)max_cost / 1000000,
-				((long)max_cost / 100000) % 10,
-				domain_distance(cpu1, cpu2),
-				cost, avg_fluct);
-
-		/*
-		 * If we iterated at least 20% past the previous maximum,
-		 * and the cost has dropped by more than 20% already,
-		 * (taking fluctuations into account) then we assume to
-		 * have found the maximum and break out of the loop early:
-		 */
-		if (size_found && (size*100 > size_found*SIZE_THRESH))
-			if (cost+avg_fluct <= 0 ||
-				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
-
-				if (migration_debug)
-					printk("-> found max.\n");
-				break;
-			}
-		/*
-		 * Increase the cachesize in 10% steps:
-		 */
-		size = size * 10 / 9;
-	}
-
-	if (migration_debug)
-		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
-			cpu1, cpu2, size_found, max_cost);
-
-	vfree(cache);
-
-	/*
-	 * A task is considered 'cache cold' if at least 2 times
-	 * the worst-case cost of migration has passed.
-	 *
-	 * (this limit is only listened to if the load-balancing
-	 * situation is 'nice' - if there is a large imbalance we
-	 * ignore it for the sake of CPU utilization and
-	 * processing fairness.)
-	 */
-	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
-}
-
-static void calibrate_migration_costs(const cpumask_t *cpu_map)
-{
-	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
-	unsigned long j0, j1, distance, max_distance = 0;
-	struct sched_domain *sd;
-
-	j0 = jiffies;
-
-	/*
-	 * First pass - calculate the cacheflush times:
-	 */
-	for_each_cpu_mask(cpu1, *cpu_map) {
-		for_each_cpu_mask(cpu2, *cpu_map) {
-			if (cpu1 == cpu2)
-				continue;
-			distance = domain_distance(cpu1, cpu2);
-			max_distance = max(max_distance, distance);
-			/*
-			 * No result cached yet?
-			 */
-			if (migration_cost[distance] == -1LL)
-				migration_cost[distance] =
-					measure_migration_cost(cpu1, cpu2);
-		}
-	}
-	/*
-	 * Second pass - update the sched domain hierarchy with
-	 * the new cache-hot-time estimations:
-	 */
-	for_each_cpu_mask(cpu, *cpu_map) {
-		distance = 0;
-		for_each_domain(cpu, sd) {
-			sd->cache_hot_time = migration_cost[distance];
-			distance++;
-		}
-	}
-	/*
-	 * Print the matrix:
-	 */
-	if (migration_debug)
-		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
-			max_cache_size,
-#ifdef CONFIG_X86
-			cpu_khz/1000
-#else
-			-1
-#endif
-		);
-	if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
-		printk("migration_cost=");
-		for (distance = 0; distance <= max_distance; distance++) {
-			if (distance)
-				printk(",");
-			printk("%ld", (long)migration_cost[distance] / 1000);
-		}
-		printk("\n");
-	}
-	j1 = jiffies;
-	if (migration_debug)
-		printk("migration: %ld seconds\n", (j1-j0) / HZ);
-
-	/*
-	 * Move back to the original CPU. NUMA-Q gets confused
-	 * if we migrate to another quad during bootup.
-	 */
-	if (raw_smp_processor_id() != orig_cpu) {
-		cpumask_t mask = cpumask_of_cpu(orig_cpu),
-			saved_mask = current->cpus_allowed;
-
-		set_cpus_allowed(current, mask);
-		set_cpus_allowed(current, saved_mask);
-	}
-}
-
 #ifdef CONFIG_NUMA
 
 /**
@@ -6803,10 +6326,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 		cpu_attach_domain(sd, i);
 	}
-	/*
-	 * Tune cache-hot values:
-	 */
-	calibrate_migration_costs(cpu_map);
 
 	return 0;
 
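
Taken together, the deleted kernel/sched.c block implemented the
buffer-touch methodology described in its opening comment. The following
stand-alone user-space approximation may help when reading the removed
code; it is illustrative only - CPU numbers 0/1, the 4MB working set and
the linear touch pattern are assumptions, and sched_setaffinity() plus
clock_gettime() stand in for the kernel's set_cpus_allowed() and
sched_clock():

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void bind_to(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		exit(1);
}

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* read+dirty the buffer, as touch_cache() did (linear here for brevity) */
static void touch(volatile long *buf, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		buf[i]++;
}

/* one measurement: dirty on 'source', then dirty again on 'target' */
static long long measure_one(volatile long *buf, size_t n, int source, int target)
{
	long long t0, t1, t2, t3;

	bind_to(source);
	t0 = now_ns();
	touch(buf, n);
	t1 = now_ns();

	bind_to(target);
	t2 = now_ns();
	touch(buf, n);
	t3 = now_ns();

	return (t1 - t0) + (t3 - t2);
}

int main(void)
{
	size_t n = (4 << 20) / sizeof(long);	/* assumed 4MB working set */
	volatile long *buf = calloc(n, sizeof(long));
	long long cost1, cost2, cost3, cost4;

	if (!buf)
		return 1;

	cost1 = measure_one(buf, n, 0, 1);	/* source CPU0, target CPU1 */
	cost2 = measure_one(buf, n, 1, 0);	/* source CPU1, target CPU0 */
	cost3 = measure_one(buf, n, 0, 0);	/* same-CPU baseline on CPU0 */
	cost4 = measure_one(buf, n, 1, 1);	/* same-CPU baseline on CPU1 */

	/* cross-CPU cost minus same-CPU cost - the quantity the removed
	 * measure_cost() computed as cost1 - cost2 in its own naming */
	printf("migration cost: %lld ns\n",
	       (cost1 + cost2) - (cost3 + cost4));
	return 0;
}

The kernel version additionally perturbed the buffer size between
iterations, subtracted the same-CPU cost inside measure_cost(), swept
buffer sizes in ~10% steps while watching for the maximum, and scaled the
result by migration_factor - none of which this sketch attempts.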