Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 1397
1 file changed, 1193 insertions(+), 204 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..df77c605c7a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
| @@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 681 | } | 681 | } |
| 682 | 682 | ||
| 683 | #ifdef CONFIG_SMP | 683 | #ifdef CONFIG_SMP |
| 684 | static unsigned long task_h_load(struct task_struct *p); | ||
| 685 | |||
| 684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 686 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| 685 | 687 | ||
| 686 | /* Give new task start runnable values to heavy its load in infant time */ | 688 | /* Give new task start runnable values to heavy its load in infant time */ |
| @@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 818 | 820 | ||
| 819 | #ifdef CONFIG_NUMA_BALANCING | 821 | #ifdef CONFIG_NUMA_BALANCING |
| 820 | /* | 822 | /* |
| 821 | * numa task sample period in ms | 823 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
| 824 | * calculated based on the task's virtual memory size and | ||
| 825 | * numa_balancing_scan_size. | ||
| 822 | */ | 826 | */ |
| 823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 827 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
| 824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 828 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
| 825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
| 826 | 829 | ||
| 827 | /* Portion of address space to scan in MB */ | 830 | /* Portion of address space to scan in MB */ |
| 828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 831 | unsigned int sysctl_numa_balancing_scan_size = 256; |
| @@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
| 830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 833 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
| 831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 834 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
| 832 | 835 | ||
| 833 | static void task_numa_placement(struct task_struct *p) | 836 | /* |
| 837 | * After skipping a page migration on a shared page, skip N more numa page | ||
| 838 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
| 839 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
| 840 | * where their memory lives, over pulling the memory towards the task. | ||
| 841 | */ | ||
| 842 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
| 843 | |||
| 844 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
| 845 | { | ||
| 846 | unsigned long rss = 0; | ||
| 847 | unsigned long nr_scan_pages; | ||
| 848 | |||
| 849 | /* | ||
| 850 | * Calculations based on RSS as non-present and empty pages are skipped | ||
| 851 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
| 852 | * on resident pages | ||
| 853 | */ | ||
| 854 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
| 855 | rss = get_mm_rss(p->mm); | ||
| 856 | if (!rss) | ||
| 857 | rss = nr_scan_pages; | ||
| 858 | |||
| 859 | rss = round_up(rss, nr_scan_pages); | ||
| 860 | return rss / nr_scan_pages; | ||
| 861 | } | ||
| 862 | |||
| 863 | /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
| 864 | #define MAX_SCAN_WINDOW 2560 | ||
| 865 | |||
| 866 | static unsigned int task_scan_min(struct task_struct *p) | ||
| 867 | { | ||
| 868 | unsigned int scan, floor; | ||
| 869 | unsigned int windows = 1; | ||
| 870 | |||
| 871 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
| 872 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
| 873 | floor = 1000 / windows; | ||
| 874 | |||
| 875 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
| 876 | return max_t(unsigned int, floor, scan); | ||
| 877 | } | ||
| 878 | |||
| 879 | static unsigned int task_scan_max(struct task_struct *p) | ||
| 880 | { | ||
| 881 | unsigned int smin = task_scan_min(p); | ||
| 882 | unsigned int smax; | ||
| 883 | |||
| 884 | /* Watch for min being lower than max due to floor calculations */ | ||
| 885 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
| 886 | return max(smin, smax); | ||
| 887 | } | ||
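
The two helpers above derive the per-task scan period bounds from the size of the address space: the sysctl periods are divided by the number of scan-size windows that fit into the task's RSS, with a floor that caps PTE scanning at MAX_SCAN_WINDOW MB/sec. The following is a minimal user-space sketch of the same arithmetic; PAGE_SHIFT, the example RSS and the main() driver are illustrative stand-ins, and get_mm_rss() is replaced by a plain parameter:

#include <stdio.h>

#define PAGE_SHIFT        12      /* assume 4K pages */
#define MAX_SCAN_WINDOW   2560    /* never scan more than this many MB/sec */

static unsigned int scan_period_min_ms = 1000;   /* sysctl_..._scan_period_min */
static unsigned int scan_period_max_ms = 60000;  /* sysctl_..._scan_period_max */
static unsigned int scan_size_mb       = 256;    /* sysctl_..._scan_size */

/* How many scan_size_mb windows does the task's RSS span? */
static unsigned int nr_scan_windows(unsigned long rss_pages)
{
    unsigned long nr_scan_pages = (unsigned long)scan_size_mb << (20 - PAGE_SHIFT);

    if (!rss_pages)
        rss_pages = nr_scan_pages;
    /* round up so a partial window still counts as one */
    rss_pages = (rss_pages + nr_scan_pages - 1) / nr_scan_pages * nr_scan_pages;
    return rss_pages / nr_scan_pages;
}

static unsigned int scan_min_ms(unsigned long rss_pages)
{
    unsigned int windows = 1, floor, scan;

    if (scan_size_mb < MAX_SCAN_WINDOW)
        windows = MAX_SCAN_WINDOW / scan_size_mb;
    floor = 1000 / windows;                       /* rate limit, in ms */

    scan = scan_period_min_ms / nr_scan_windows(rss_pages);
    return scan > floor ? scan : floor;
}

static unsigned int scan_max_ms(unsigned long rss_pages)
{
    unsigned int smin = scan_min_ms(rss_pages);
    unsigned int smax = scan_period_max_ms / nr_scan_windows(rss_pages);

    return smax > smin ? smax : smin;             /* max never drops below min */
}

int main(void)
{
    unsigned long rss_pages = 1024UL * 1024;      /* e.g. a 4GB RSS task */

    printf("min %u ms, max %u ms\n", scan_min_ms(rss_pages), scan_max_ms(rss_pages));
    return 0;
}
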
| 888 | |||
| 889 | /* | ||
| 890 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
| 891 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
| 892 | * scans. This will give the process the chance to accumulate more faults on | ||
| 893 | * the preferred node but still allow the scheduler to move the task again if | ||
| 894 | * the node's CPUs are overloaded. | ||
| 895 | */ | ||
| 896 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
| 897 | |||
| 898 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 899 | { | ||
| 900 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
| 901 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
| 902 | } | ||
| 903 | |||
| 904 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 905 | { | ||
| 906 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
| 907 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
| 908 | } | ||
| 909 | |||
| 910 | struct numa_group { | ||
| 911 | atomic_t refcount; | ||
| 912 | |||
| 913 | spinlock_t lock; /* nr_tasks, tasks */ | ||
| 914 | int nr_tasks; | ||
| 915 | pid_t gid; | ||
| 916 | struct list_head task_list; | ||
| 917 | |||
| 918 | struct rcu_head rcu; | ||
| 919 | unsigned long total_faults; | ||
| 920 | unsigned long faults[0]; | ||
| 921 | }; | ||
| 922 | |||
| 923 | pid_t task_numa_group_id(struct task_struct *p) | ||
| 924 | { | ||
| 925 | return p->numa_group ? p->numa_group->gid : 0; | ||
| 926 | } | ||
| 927 | |||
| 928 | static inline int task_faults_idx(int nid, int priv) | ||
| 929 | { | ||
| 930 | return 2 * nid + priv; | ||
| 931 | } | ||
| 932 | |||
| 933 | static inline unsigned long task_faults(struct task_struct *p, int nid) | ||
| 934 | { | ||
| 935 | if (!p->numa_faults) | ||
| 936 | return 0; | ||
| 937 | |||
| 938 | return p->numa_faults[task_faults_idx(nid, 0)] + | ||
| 939 | p->numa_faults[task_faults_idx(nid, 1)]; | ||
| 940 | } | ||
| 941 | |||
| 942 | static inline unsigned long group_faults(struct task_struct *p, int nid) | ||
| 943 | { | ||
| 944 | if (!p->numa_group) | ||
| 945 | return 0; | ||
| 946 | |||
| 947 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | ||
| 948 | } | ||
| 949 | |||
| 950 | /* | ||
| 951 | * These return the fraction of accesses done by a particular task, or | ||
| 952 | * task group, on a particular numa node. The group weight is given a | ||
| 953 | * larger multiplier, in order to group tasks together that are almost | ||
| 954 | * evenly spread out between numa nodes. | ||
| 955 | */ | ||
| 956 | static inline unsigned long task_weight(struct task_struct *p, int nid) | ||
| 957 | { | ||
| 958 | unsigned long total_faults; | ||
| 959 | |||
| 960 | if (!p->numa_faults) | ||
| 961 | return 0; | ||
| 962 | |||
| 963 | total_faults = p->total_numa_faults; | ||
| 964 | |||
| 965 | if (!total_faults) | ||
| 966 | return 0; | ||
| 967 | |||
| 968 | return 1000 * task_faults(p, nid) / total_faults; | ||
| 969 | } | ||
| 970 | |||
| 971 | static inline unsigned long group_weight(struct task_struct *p, int nid) | ||
| 834 | { | 972 | { |
| 835 | int seq; | 973 | if (!p->numa_group || !p->numa_group->total_faults) |
| 974 | return 0; | ||
| 836 | 975 | ||
| 837 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | 976 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; |
| 977 | } | ||
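
For reference, the fault statistics used above live in a flat array with two counters per node, indexed as 2*nid + priv (priv distinguishes private from shared accesses), and the task/group weights are simply per-mille fractions of the corresponding totals. A self-contained illustration of that layout, with a made-up node count and fabricated fault history:

#include <stdio.h>

#define NR_NODES 4    /* hypothetical nr_node_ids */

/* [2*nid + priv]; priv: 1 = private to this task, 0 = shared */
static unsigned long faults[2 * NR_NODES];

static int faults_idx(int nid, int priv)
{
    return 2 * nid + priv;               /* two counters per node */
}

static unsigned long node_faults(int nid)
{
    /* sum the shared (0) and private (1) counters for this node */
    return faults[faults_idx(nid, 0)] + faults[faults_idx(nid, 1)];
}

/* fraction (in 1/1000ths) of all recorded faults that hit @nid */
static unsigned long node_weight(int nid, unsigned long total)
{
    if (!total)
        return 0;
    return 1000 * node_faults(nid) / total;
}

int main(void)
{
    unsigned long total = 0;
    int nid;

    /* made-up fault history: node 1 is clearly the hottest */
    faults[faults_idx(0, 1)] = 50;
    faults[faults_idx(1, 0)] = 300;
    faults[faults_idx(1, 1)] = 500;
    faults[faults_idx(3, 0)] = 150;

    for (nid = 0; nid < NR_NODES; nid++)
        total += node_faults(nid);

    for (nid = 0; nid < NR_NODES; nid++)
        printf("node %d: weight %lu/1000\n", nid, node_weight(nid, total));
    return 0;
}
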
| 978 | |||
| 979 | static unsigned long weighted_cpuload(const int cpu); | ||
| 980 | static unsigned long source_load(int cpu, int type); | ||
| 981 | static unsigned long target_load(int cpu, int type); | ||
| 982 | static unsigned long power_of(int cpu); | ||
| 983 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
| 984 | |||
| 985 | /* Cached statistics for all CPUs within a node */ | ||
| 986 | struct numa_stats { | ||
| 987 | unsigned long nr_running; | ||
| 988 | unsigned long load; | ||
| 989 | |||
| 990 | /* Total compute capacity of CPUs on a node */ | ||
| 991 | unsigned long power; | ||
| 992 | |||
| 993 | /* Approximate capacity in terms of runnable tasks on a node */ | ||
| 994 | unsigned long capacity; | ||
| 995 | int has_capacity; | ||
| 996 | }; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * XXX borrowed from update_sg_lb_stats | ||
| 1000 | */ | ||
| 1001 | static void update_numa_stats(struct numa_stats *ns, int nid) | ||
| 1002 | { | ||
| 1003 | int cpu; | ||
| 1004 | |||
| 1005 | memset(ns, 0, sizeof(*ns)); | ||
| 1006 | for_each_cpu(cpu, cpumask_of_node(nid)) { | ||
| 1007 | struct rq *rq = cpu_rq(cpu); | ||
| 1008 | |||
| 1009 | ns->nr_running += rq->nr_running; | ||
| 1010 | ns->load += weighted_cpuload(cpu); | ||
| 1011 | ns->power += power_of(cpu); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | ||
| 1015 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | ||
| 1016 | ns->has_capacity = (ns->nr_running < ns->capacity); | ||
| 1017 | } | ||
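
update_numa_stats() collapses the per-CPU figures into a per-node view: load is normalised by the node's total compute power, capacity is that power expressed in whole "average CPUs", and has_capacity asks whether fewer tasks are running than the node has CPUs' worth of capacity. A rough user-space rendering of that aggregation, with illustrative per-CPU numbers standing in for the real rq fields:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024   /* power of one "average" CPU */

struct numa_stats {
    unsigned long nr_running;
    unsigned long load;      /* load normalised by node compute power */
    unsigned long power;     /* sum of per-CPU power */
    unsigned long capacity;  /* approximate number of runnable tasks */
    int has_capacity;
};

static void update_stats(struct numa_stats *ns,
                         const unsigned long *cpu_load,
                         const unsigned long *cpu_power,
                         const unsigned int *cpu_nr_running,
                         int nr_cpus)
{
    int cpu;

    ns->nr_running = ns->load = ns->power = 0;
    for (cpu = 0; cpu < nr_cpus; cpu++) {
        ns->nr_running += cpu_nr_running[cpu];
        ns->load += cpu_load[cpu];
        ns->power += cpu_power[cpu];
    }

    /* scale load so nodes with different compute power compare fairly */
    ns->load = ns->load * SCHED_POWER_SCALE / ns->power;
    /* round power to the nearest whole "CPU's worth" of capacity */
    ns->capacity = (ns->power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
    ns->has_capacity = ns->nr_running < ns->capacity;
}

int main(void)
{
    /* a hypothetical 4-CPU node */
    unsigned long load[]  = { 900, 300, 0, 1200 };
    unsigned long power[] = { 1024, 1024, 1024, 1024 };
    unsigned int  nr[]    = { 1, 1, 0, 2 };
    struct numa_stats ns;

    update_stats(&ns, load, power, nr, 4);
    printf("load=%lu capacity=%lu has_capacity=%d\n",
           ns.load, ns.capacity, ns.has_capacity);
    return 0;
}
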
| 1018 | |||
| 1019 | struct task_numa_env { | ||
| 1020 | struct task_struct *p; | ||
| 1021 | |||
| 1022 | int src_cpu, src_nid; | ||
| 1023 | int dst_cpu, dst_nid; | ||
| 1024 | |||
| 1025 | struct numa_stats src_stats, dst_stats; | ||
| 1026 | |||
| 1027 | int imbalance_pct, idx; | ||
| 1028 | |||
| 1029 | struct task_struct *best_task; | ||
| 1030 | long best_imp; | ||
| 1031 | int best_cpu; | ||
| 1032 | }; | ||
| 1033 | |||
| 1034 | static void task_numa_assign(struct task_numa_env *env, | ||
| 1035 | struct task_struct *p, long imp) | ||
| 1036 | { | ||
| 1037 | if (env->best_task) | ||
| 1038 | put_task_struct(env->best_task); | ||
| 1039 | if (p) | ||
| 1040 | get_task_struct(p); | ||
| 1041 | |||
| 1042 | env->best_task = p; | ||
| 1043 | env->best_imp = imp; | ||
| 1044 | env->best_cpu = env->dst_cpu; | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | /* | ||
| 1048 | * This checks if the overall compute and NUMA accesses of the system would | ||
| 1049 | * be improved if the source tasks was migrated to the target dst_cpu taking | ||
| 1050 | * into account that it might be best if task running on the dst_cpu should | ||
| 1051 | * be exchanged with the source task | ||
| 1052 | */ | ||
| 1053 | static void task_numa_compare(struct task_numa_env *env, | ||
| 1054 | long taskimp, long groupimp) | ||
| 1055 | { | ||
| 1056 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
| 1057 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | ||
| 1058 | struct task_struct *cur; | ||
| 1059 | long dst_load, src_load; | ||
| 1060 | long load; | ||
| 1061 | long imp = (groupimp > 0) ? groupimp : taskimp; | ||
| 1062 | |||
| 1063 | rcu_read_lock(); | ||
| 1064 | cur = ACCESS_ONCE(dst_rq->curr); | ||
| 1065 | if (cur->pid == 0) /* idle */ | ||
| 1066 | cur = NULL; | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * "imp" is the fault differential for the source task between the | ||
| 1070 | * source and destination node. Calculate the total differential for | ||
| 1071 | * the source task and potential destination task. The more negative | ||
| 1072 | * the value is, the more remote accesses would be expected to | ||
| 1073 | * be incurred if the tasks were swapped. | ||
| 1074 | */ | ||
| 1075 | if (cur) { | ||
| 1076 | /* Skip this swap candidate if cannot move to the source cpu */ | ||
| 1077 | if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) | ||
| 1078 | goto unlock; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * If dst and source tasks are in the same NUMA group, or not | ||
| 1082 | * in any group then look only at task weights. | ||
| 1083 | */ | ||
| 1084 | if (cur->numa_group == env->p->numa_group) { | ||
| 1085 | imp = taskimp + task_weight(cur, env->src_nid) - | ||
| 1086 | task_weight(cur, env->dst_nid); | ||
| 1087 | /* | ||
| 1088 | * Add some hysteresis to prevent swapping the | ||
| 1089 | * tasks within a group over tiny differences. | ||
| 1090 | */ | ||
| 1091 | if (cur->numa_group) | ||
| 1092 | imp -= imp/16; | ||
| 1093 | } else { | ||
| 1094 | /* | ||
| 1095 | * Compare the group weights. If a task is all by | ||
| 1096 | * itself (not part of a group), use the task weight | ||
| 1097 | * instead. | ||
| 1098 | */ | ||
| 1099 | if (env->p->numa_group) | ||
| 1100 | imp = groupimp; | ||
| 1101 | else | ||
| 1102 | imp = taskimp; | ||
| 1103 | |||
| 1104 | if (cur->numa_group) | ||
| 1105 | imp += group_weight(cur, env->src_nid) - | ||
| 1106 | group_weight(cur, env->dst_nid); | ||
| 1107 | else | ||
| 1108 | imp += task_weight(cur, env->src_nid) - | ||
| 1109 | task_weight(cur, env->dst_nid); | ||
| 1110 | } | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | if (imp < env->best_imp) | ||
| 1114 | goto unlock; | ||
| 1115 | |||
| 1116 | if (!cur) { | ||
| 1117 | /* Is there capacity at our destination? */ | ||
| 1118 | if (env->src_stats.has_capacity && | ||
| 1119 | !env->dst_stats.has_capacity) | ||
| 1120 | goto unlock; | ||
| 1121 | |||
| 1122 | goto balance; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* Balance doesn't matter much if we're running a task per cpu */ | ||
| 1126 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | ||
| 1127 | goto assign; | ||
| 1128 | |||
| 1129 | /* | ||
| 1130 | * In the overloaded case, try and keep the load balanced. | ||
| 1131 | */ | ||
| 1132 | balance: | ||
| 1133 | dst_load = env->dst_stats.load; | ||
| 1134 | src_load = env->src_stats.load; | ||
| 1135 | |||
| 1136 | /* XXX missing power terms */ | ||
| 1137 | load = task_h_load(env->p); | ||
| 1138 | dst_load += load; | ||
| 1139 | src_load -= load; | ||
| 1140 | |||
| 1141 | if (cur) { | ||
| 1142 | load = task_h_load(cur); | ||
| 1143 | dst_load -= load; | ||
| 1144 | src_load += load; | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | /* make src_load the smaller */ | ||
| 1148 | if (dst_load < src_load) | ||
| 1149 | swap(dst_load, src_load); | ||
| 1150 | |||
| 1151 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
| 1152 | goto unlock; | ||
| 1153 | |||
| 1154 | assign: | ||
| 1155 | task_numa_assign(env, cur, imp); | ||
| 1156 | unlock: | ||
| 1157 | rcu_read_unlock(); | ||
| 1158 | } | ||
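
The final acceptance test in task_numa_compare() mirrors the regular load balancer: after accounting for the load that p (and, for a swap, the current destination task) would carry across, the smaller side scaled by imbalance_pct must not fall short of the larger side scaled by 100. A small sketch of just that test, with made-up load figures:

#include <stdbool.h>
#include <stdio.h>

/*
 * Would moving a task of weight @task_load from src to dst (and optionally
 * pulling back a task of weight @cur_load) keep the two nodes balanced
 * within @imbalance_pct percent?
 */
static bool swap_keeps_balance(long src_load, long dst_load,
                               long task_load, long cur_load,
                               int imbalance_pct)
{
    long tmp;

    dst_load += task_load;          /* p moves to dst */
    src_load -= task_load;
    dst_load -= cur_load;           /* cur (if any) moves back to src */
    src_load += cur_load;

    if (dst_load < src_load) {      /* make src_load the smaller side */
        tmp = dst_load;
        dst_load = src_load;
        src_load = tmp;
    }

    /* reject if the smaller side, inflated by the pct, still trails */
    return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
    /* e.g. imbalance_pct 112 tolerates roughly 12% skew */
    printf("%d\n", swap_keeps_balance(2000, 1800, 300, 0, 112));   /* 0: plain move too skewed */
    printf("%d\n", swap_keeps_balance(2000, 1800, 300, 250, 112)); /* 1: near-even swap is fine */
    return 0;
}
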
| 1159 | |||
| 1160 | static void task_numa_find_cpu(struct task_numa_env *env, | ||
| 1161 | long taskimp, long groupimp) | ||
| 1162 | { | ||
| 1163 | int cpu; | ||
| 1164 | |||
| 1165 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | ||
| 1166 | /* Skip this CPU if the source task cannot migrate */ | ||
| 1167 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) | ||
| 1168 | continue; | ||
| 1169 | |||
| 1170 | env->dst_cpu = cpu; | ||
| 1171 | task_numa_compare(env, taskimp, groupimp); | ||
| 1172 | } | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | static int task_numa_migrate(struct task_struct *p) | ||
| 1176 | { | ||
| 1177 | struct task_numa_env env = { | ||
| 1178 | .p = p, | ||
| 1179 | |||
| 1180 | .src_cpu = task_cpu(p), | ||
| 1181 | .src_nid = task_node(p), | ||
| 1182 | |||
| 1183 | .imbalance_pct = 112, | ||
| 1184 | |||
| 1185 | .best_task = NULL, | ||
| 1186 | .best_imp = 0, | ||
| 1187 | .best_cpu = -1 | ||
| 1188 | }; | ||
| 1189 | struct sched_domain *sd; | ||
| 1190 | unsigned long taskweight, groupweight; | ||
| 1191 | int nid, ret; | ||
| 1192 | long taskimp, groupimp; | ||
| 1193 | |||
| 1194 | /* | ||
| 1195 | * Pick the lowest SD_NUMA domain, as that would have the smallest | ||
| 1196 | * imbalance and would be the first to start moving tasks about. | ||
| 1197 | * | ||
| 1198 | * And we want to avoid any moving of tasks about, as that would create | ||
| 1199 | * random movement of tasks -- counter to the numa conditions we're trying | ||
| 1200 | * to satisfy here. | ||
| 1201 | */ | ||
| 1202 | rcu_read_lock(); | ||
| 1203 | sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | ||
| 1204 | env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 1205 | rcu_read_unlock(); | ||
| 1206 | |||
| 1207 | taskweight = task_weight(p, env.src_nid); | ||
| 1208 | groupweight = group_weight(p, env.src_nid); | ||
| 1209 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1210 | env.dst_nid = p->numa_preferred_nid; | ||
| 1211 | taskimp = task_weight(p, env.dst_nid) - taskweight; | ||
| 1212 | groupimp = group_weight(p, env.dst_nid) - groupweight; | ||
| 1213 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1214 | |||
| 1215 | /* If the preferred nid has capacity, try to use it. */ | ||
| 1216 | if (env.dst_stats.has_capacity) | ||
| 1217 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1218 | |||
| 1219 | /* No space available on the preferred nid. Look elsewhere. */ | ||
| 1220 | if (env.best_cpu == -1) { | ||
| 1221 | for_each_online_node(nid) { | ||
| 1222 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | ||
| 1223 | continue; | ||
| 1224 | |||
| 1225 | /* Only consider nodes where both task and groups benefit */ | ||
| 1226 | taskimp = task_weight(p, nid) - taskweight; | ||
| 1227 | groupimp = group_weight(p, nid) - groupweight; | ||
| 1228 | if (taskimp < 0 && groupimp < 0) | ||
| 1229 | continue; | ||
| 1230 | |||
| 1231 | env.dst_nid = nid; | ||
| 1232 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1233 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1234 | } | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | /* No better CPU than the current one was found. */ | ||
| 1238 | if (env.best_cpu == -1) | ||
| 1239 | return -EAGAIN; | ||
| 1240 | |||
| 1241 | sched_setnuma(p, env.dst_nid); | ||
| 1242 | |||
| 1243 | /* | ||
| 1244 | * Reset the scan period if the task is being rescheduled on an | ||
| 1245 | * alternative node to recheck if the task is now properly placed. | ||
| 1246 | */ | ||
| 1247 | p->numa_scan_period = task_scan_min(p); | ||
| 1248 | |||
| 1249 | if (env.best_task == NULL) { | ||
| 1250 | int ret = migrate_task_to(p, env.best_cpu); | ||
| 1251 | return ret; | ||
| 1252 | } | ||
| 1253 | |||
| 1254 | ret = migrate_swap(p, env.best_task); | ||
| 1255 | put_task_struct(env.best_task); | ||
| 1256 | return ret; | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | /* Attempt to migrate a task to a CPU on the preferred node. */ | ||
| 1260 | static void numa_migrate_preferred(struct task_struct *p) | ||
| 1261 | { | ||
| 1262 | /* This task has no NUMA fault statistics yet */ | ||
| 1263 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | ||
| 1264 | return; | ||
| 1265 | |||
| 1266 | /* Periodically retry migrating the task to the preferred node */ | ||
| 1267 | p->numa_migrate_retry = jiffies + HZ; | ||
| 1268 | |||
| 1269 | /* Success if task is already running on preferred CPU */ | ||
| 1270 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | ||
| 838 | return; | 1271 | return; |
| 1272 | |||
| 1273 | /* Otherwise, try migrate to a CPU on the preferred node */ | ||
| 1274 | task_numa_migrate(p); | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | /* | ||
| 1278 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | ||
| 1279 | * increments. The more local the fault statistics are, the higher the scan | ||
| 1280 | * period will be for the next scan window. If the local/remote ratio is below | ||
| 1281 | * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS) | ||
| 1282 | * the scan period will decrease. | ||
| 1283 | */ | ||
| 1284 | #define NUMA_PERIOD_SLOTS 10 | ||
| 1285 | #define NUMA_PERIOD_THRESHOLD 3 | ||
| 1286 | |||
| 1287 | /* | ||
| 1288 | * Increase the scan period (slow down scanning) if the majority of | ||
| 1289 | * our memory is already on our local node, or if the majority of | ||
| 1290 | * the page accesses are shared with other processes. | ||
| 1291 | * Otherwise, decrease the scan period. | ||
| 1292 | */ | ||
| 1293 | static void update_task_scan_period(struct task_struct *p, | ||
| 1294 | unsigned long shared, unsigned long private) | ||
| 1295 | { | ||
| 1296 | unsigned int period_slot; | ||
| 1297 | int ratio; | ||
| 1298 | int diff; | ||
| 1299 | |||
| 1300 | unsigned long remote = p->numa_faults_locality[0]; | ||
| 1301 | unsigned long local = p->numa_faults_locality[1]; | ||
| 1302 | |||
| 1303 | /* | ||
| 1304 | * If there were no recorded hinting faults then either the task is | ||
| 1305 | * completely idle or all activity is in areas that are not of interest | ||
| 1306 | * to automatic numa balancing. Scan slower. | ||
| 1307 | */ | ||
| 1308 | if (local + shared == 0) { | ||
| 1309 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
| 1310 | p->numa_scan_period << 1); | ||
| 1311 | |||
| 1312 | p->mm->numa_next_scan = jiffies + | ||
| 1313 | msecs_to_jiffies(p->numa_scan_period); | ||
| 1314 | |||
| 1315 | return; | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | /* | ||
| 1319 | * Prepare to scale scan period relative to the current period. | ||
| 1320 | * == NUMA_PERIOD_THRESHOLD scan period stays the same | ||
| 1321 | * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) | ||
| 1322 | * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) | ||
| 1323 | */ | ||
| 1324 | period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); | ||
| 1325 | ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); | ||
| 1326 | if (ratio >= NUMA_PERIOD_THRESHOLD) { | ||
| 1327 | int slot = ratio - NUMA_PERIOD_THRESHOLD; | ||
| 1328 | if (!slot) | ||
| 1329 | slot = 1; | ||
| 1330 | diff = slot * period_slot; | ||
| 1331 | } else { | ||
| 1332 | diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; | ||
| 1333 | |||
| 1334 | /* | ||
| 1335 | * Scale scan rate increases based on sharing. There is an | ||
| 1336 | * inverse relationship between the degree of sharing and | ||
| 1337 | * the adjustment made to the scanning period. Broadly | ||
| 1338 | * speaking the intent is that there is little point | ||
| 1339 | * scanning faster if shared accesses dominate as it may | ||
| 1340 | * simply bounce migrations uselessly | ||
| 1341 | */ | ||
| 1342 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
| 1343 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | ||
| 1344 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | p->numa_scan_period = clamp(p->numa_scan_period + diff, | ||
| 1348 | task_scan_min(p), task_scan_max(p)); | ||
| 1349 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1350 | } | ||
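
update_task_scan_period() divides the current period into NUMA_PERIOD_SLOTS slots and adjusts it by whole slots: a high local-fault ratio lengthens the period, a low one shortens it, the speed-up is damped by how shared the accesses were, and the result is clamped to the per-task bounds. A compact sketch of that update follows; it takes the min/max bounds as parameters instead of deriving them from RSS, and assumes the zero-fault case has already been handled as in the patch:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS     10
#define NUMA_PERIOD_THRESHOLD 3

static unsigned int clampu(long v, unsigned int lo, unsigned int hi)
{
    if (v < (long)lo) return lo;
    if (v > (long)hi) return hi;
    return (unsigned int)v;
}

/* assumes local + remote != 0 and private + shared != 0 */
static unsigned int next_scan_period(unsigned int period,
                                     unsigned long local, unsigned long remote,
                                     unsigned long private, unsigned long shared,
                                     unsigned int pmin, unsigned int pmax)
{
    unsigned int period_slot = (period + NUMA_PERIOD_SLOTS - 1) / NUMA_PERIOD_SLOTS;
    int ratio = (int)(local * NUMA_PERIOD_SLOTS / (local + remote));
    int diff;

    if (ratio >= NUMA_PERIOD_THRESHOLD) {
        /* mostly local: slow scanning down by (ratio - threshold) slots */
        int slot = ratio - NUMA_PERIOD_THRESHOLD;
        if (!slot)
            slot = 1;
        diff = slot * period_slot;
    } else {
        /* mostly remote: speed up, damped by how shared the accesses are */
        int private_ratio =
            (int)((private * NUMA_PERIOD_SLOTS + private + shared - 1) /
                  (private + shared));

        diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
        diff = diff * private_ratio / NUMA_PERIOD_SLOTS;
    }

    return clampu((long)period + diff, pmin, pmax);
}

int main(void)
{
    /* mostly-local task: period grows; mostly-remote, private task: it shrinks */
    printf("%u\n", next_scan_period(1000, 900, 100, 800, 200, 100, 60000));
    printf("%u\n", next_scan_period(1000, 100, 900, 800, 200, 100, 60000));
    return 0;
}
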
| 1351 | |||
| 1352 | static void task_numa_placement(struct task_struct *p) | ||
| 1353 | { | ||
| 1354 | int seq, nid, max_nid = -1, max_group_nid = -1; | ||
| 1355 | unsigned long max_faults = 0, max_group_faults = 0; | ||
| 1356 | unsigned long fault_types[2] = { 0, 0 }; | ||
| 1357 | spinlock_t *group_lock = NULL; | ||
| 1358 | |||
| 839 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1359 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
| 840 | if (p->numa_scan_seq == seq) | 1360 | if (p->numa_scan_seq == seq) |
| 841 | return; | 1361 | return; |
| 842 | p->numa_scan_seq = seq; | 1362 | p->numa_scan_seq = seq; |
| 1363 | p->numa_scan_period_max = task_scan_max(p); | ||
| 1364 | |||
| 1365 | /* If the task is part of a group prevent parallel updates to group stats */ | ||
| 1366 | if (p->numa_group) { | ||
| 1367 | group_lock = &p->numa_group->lock; | ||
| 1368 | spin_lock(group_lock); | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | /* Find the node with the highest number of faults */ | ||
| 1372 | for_each_online_node(nid) { | ||
| 1373 | unsigned long faults = 0, group_faults = 0; | ||
| 1374 | int priv, i; | ||
| 1375 | |||
| 1376 | for (priv = 0; priv < 2; priv++) { | ||
| 1377 | long diff; | ||
| 1378 | |||
| 1379 | i = task_faults_idx(nid, priv); | ||
| 1380 | diff = -p->numa_faults[i]; | ||
| 1381 | |||
| 1382 | /* Decay existing window, copy faults since last scan */ | ||
| 1383 | p->numa_faults[i] >>= 1; | ||
| 1384 | p->numa_faults[i] += p->numa_faults_buffer[i]; | ||
| 1385 | fault_types[priv] += p->numa_faults_buffer[i]; | ||
| 1386 | p->numa_faults_buffer[i] = 0; | ||
| 1387 | |||
| 1388 | faults += p->numa_faults[i]; | ||
| 1389 | diff += p->numa_faults[i]; | ||
| 1390 | p->total_numa_faults += diff; | ||
| 1391 | if (p->numa_group) { | ||
| 1392 | /* safe because we can only change our own group */ | ||
| 1393 | p->numa_group->faults[i] += diff; | ||
| 1394 | p->numa_group->total_faults += diff; | ||
| 1395 | group_faults += p->numa_group->faults[i]; | ||
| 1396 | } | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | if (faults > max_faults) { | ||
| 1400 | max_faults = faults; | ||
| 1401 | max_nid = nid; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | if (group_faults > max_group_faults) { | ||
| 1405 | max_group_faults = group_faults; | ||
| 1406 | max_group_nid = nid; | ||
| 1407 | } | ||
| 1408 | } | ||
| 1409 | |||
| 1410 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
| 1411 | |||
| 1412 | if (p->numa_group) { | ||
| 1413 | /* | ||
| 1414 | * If the preferred task and group nids are different, | ||
| 1415 | * iterate over the nodes again to find the best place. | ||
| 1416 | */ | ||
| 1417 | if (max_nid != max_group_nid) { | ||
| 1418 | unsigned long weight, max_weight = 0; | ||
| 1419 | |||
| 1420 | for_each_online_node(nid) { | ||
| 1421 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
| 1422 | if (weight > max_weight) { | ||
| 1423 | max_weight = weight; | ||
| 1424 | max_nid = nid; | ||
| 1425 | } | ||
| 1426 | } | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | spin_unlock(group_lock); | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | /* Preferred node as the node with the most faults */ | ||
| 1433 | if (max_faults && max_nid != p->numa_preferred_nid) { | ||
| 1434 | /* Update the preferred nid and migrate task if possible */ | ||
| 1435 | sched_setnuma(p, max_nid); | ||
| 1436 | numa_migrate_preferred(p); | ||
| 1437 | } | ||
| 1438 | } | ||
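
task_numa_placement() ages the per-node counters with a simple exponential decay: each scan sequence halves the old value and folds in the faults recorded since the last scan, so recent behaviour dominates while history fades, and the node with the largest decayed count becomes the preferred node. A tiny sketch of that decay-and-pick step with fabricated fault buffers:

#include <stdio.h>

#define NR_NODES 4    /* hypothetical node count */

/* halve the running counters, fold in this window's faults, return best node */
static int decay_and_pick(unsigned long faults[NR_NODES],
                          unsigned long buffer[NR_NODES])
{
    unsigned long max_faults = 0;
    int nid, max_nid = -1;

    for (nid = 0; nid < NR_NODES; nid++) {
        faults[nid] >>= 1;            /* decay the previous window */
        faults[nid] += buffer[nid];   /* add faults since the last scan */
        buffer[nid] = 0;

        if (faults[nid] > max_faults) {
            max_faults = faults[nid];
            max_nid = nid;
        }
    }
    return max_nid;                   /* becomes numa_preferred_nid */
}

int main(void)
{
    unsigned long faults[NR_NODES] = { 400, 100, 0, 0 };
    unsigned long window[NR_NODES] = { 20, 350, 10, 0 };

    /* the old favourite (node 0) decays; node 1 takes over on fresh faults */
    printf("preferred nid = %d\n", decay_and_pick(faults, window));
    return 0;
}
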
| 1439 | |||
| 1440 | static inline int get_numa_group(struct numa_group *grp) | ||
| 1441 | { | ||
| 1442 | return atomic_inc_not_zero(&grp->refcount); | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | static inline void put_numa_group(struct numa_group *grp) | ||
| 1446 | { | ||
| 1447 | if (atomic_dec_and_test(&grp->refcount)) | ||
| 1448 | kfree_rcu(grp, rcu); | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | static void task_numa_group(struct task_struct *p, int cpupid, int flags, | ||
| 1452 | int *priv) | ||
| 1453 | { | ||
| 1454 | struct numa_group *grp, *my_grp; | ||
| 1455 | struct task_struct *tsk; | ||
| 1456 | bool join = false; | ||
| 1457 | int cpu = cpupid_to_cpu(cpupid); | ||
| 1458 | int i; | ||
| 1459 | |||
| 1460 | if (unlikely(!p->numa_group)) { | ||
| 1461 | unsigned int size = sizeof(struct numa_group) + | ||
| 1462 | 2*nr_node_ids*sizeof(unsigned long); | ||
| 1463 | |||
| 1464 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
| 1465 | if (!grp) | ||
| 1466 | return; | ||
| 1467 | |||
| 1468 | atomic_set(&grp->refcount, 1); | ||
| 1469 | spin_lock_init(&grp->lock); | ||
| 1470 | INIT_LIST_HEAD(&grp->task_list); | ||
| 1471 | grp->gid = p->pid; | ||
| 1472 | |||
| 1473 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1474 | grp->faults[i] = p->numa_faults[i]; | ||
| 1475 | |||
| 1476 | grp->total_faults = p->total_numa_faults; | ||
| 1477 | |||
| 1478 | list_add(&p->numa_entry, &grp->task_list); | ||
| 1479 | grp->nr_tasks++; | ||
| 1480 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | rcu_read_lock(); | ||
| 1484 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | ||
| 1485 | |||
| 1486 | if (!cpupid_match_pid(tsk, cpupid)) | ||
| 1487 | goto no_join; | ||
| 1488 | |||
| 1489 | grp = rcu_dereference(tsk->numa_group); | ||
| 1490 | if (!grp) | ||
| 1491 | goto no_join; | ||
| 1492 | |||
| 1493 | my_grp = p->numa_group; | ||
| 1494 | if (grp == my_grp) | ||
| 1495 | goto no_join; | ||
| 1496 | |||
| 1497 | /* | ||
| 1498 | * Only join the other group if it's bigger; if we're the bigger group, | ||
| 1499 | * the other task will join us. | ||
| 1500 | */ | ||
| 1501 | if (my_grp->nr_tasks > grp->nr_tasks) | ||
| 1502 | goto no_join; | ||
| 1503 | |||
| 1504 | /* | ||
| 1505 | * Tie-break on the grp address. | ||
| 1506 | */ | ||
| 1507 | if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) | ||
| 1508 | goto no_join; | ||
| 1509 | |||
| 1510 | /* Always join threads in the same process. */ | ||
| 1511 | if (tsk->mm == current->mm) | ||
| 1512 | join = true; | ||
| 1513 | |||
| 1514 | /* Simple filter to avoid false positives due to PID collisions */ | ||
| 1515 | if (flags & TNF_SHARED) | ||
| 1516 | join = true; | ||
| 1517 | |||
| 1518 | /* Update priv based on whether false sharing was detected */ | ||
| 1519 | *priv = !join; | ||
| 1520 | |||
| 1521 | if (join && !get_numa_group(grp)) | ||
| 1522 | goto no_join; | ||
| 843 | 1523 | ||
| 844 | /* FIXME: Scheduling placement policy hints go here */ | 1524 | rcu_read_unlock(); |
| 1525 | |||
| 1526 | if (!join) | ||
| 1527 | return; | ||
| 1528 | |||
| 1529 | double_lock(&my_grp->lock, &grp->lock); | ||
| 1530 | |||
| 1531 | for (i = 0; i < 2*nr_node_ids; i++) { | ||
| 1532 | my_grp->faults[i] -= p->numa_faults[i]; | ||
| 1533 | grp->faults[i] += p->numa_faults[i]; | ||
| 1534 | } | ||
| 1535 | my_grp->total_faults -= p->total_numa_faults; | ||
| 1536 | grp->total_faults += p->total_numa_faults; | ||
| 1537 | |||
| 1538 | list_move(&p->numa_entry, &grp->task_list); | ||
| 1539 | my_grp->nr_tasks--; | ||
| 1540 | grp->nr_tasks++; | ||
| 1541 | |||
| 1542 | spin_unlock(&my_grp->lock); | ||
| 1543 | spin_unlock(&grp->lock); | ||
| 1544 | |||
| 1545 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1546 | |||
| 1547 | put_numa_group(my_grp); | ||
| 1548 | return; | ||
| 1549 | |||
| 1550 | no_join: | ||
| 1551 | rcu_read_unlock(); | ||
| 1552 | return; | ||
| 1553 | } | ||
| 1554 | |||
| 1555 | void task_numa_free(struct task_struct *p) | ||
| 1556 | { | ||
| 1557 | struct numa_group *grp = p->numa_group; | ||
| 1558 | int i; | ||
| 1559 | void *numa_faults = p->numa_faults; | ||
| 1560 | |||
| 1561 | if (grp) { | ||
| 1562 | spin_lock(&grp->lock); | ||
| 1563 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1564 | grp->faults[i] -= p->numa_faults[i]; | ||
| 1565 | grp->total_faults -= p->total_numa_faults; | ||
| 1566 | |||
| 1567 | list_del(&p->numa_entry); | ||
| 1568 | grp->nr_tasks--; | ||
| 1569 | spin_unlock(&grp->lock); | ||
| 1570 | rcu_assign_pointer(p->numa_group, NULL); | ||
| 1571 | put_numa_group(grp); | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | p->numa_faults = NULL; | ||
| 1575 | p->numa_faults_buffer = NULL; | ||
| 1576 | kfree(numa_faults); | ||
| 845 | } | 1577 | } |
| 846 | 1578 | ||
| 847 | /* | 1579 | /* |
| 848 | * Got a PROT_NONE fault for a page on @node. | 1580 | * Got a PROT_NONE fault for a page on @node. |
| 849 | */ | 1581 | */ |
| 850 | void task_numa_fault(int node, int pages, bool migrated) | 1582 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) |
| 851 | { | 1583 | { |
| 852 | struct task_struct *p = current; | 1584 | struct task_struct *p = current; |
| 1585 | bool migrated = flags & TNF_MIGRATED; | ||
| 1586 | int priv; | ||
| 853 | 1587 | ||
| 854 | if (!numabalancing_enabled) | 1588 | if (!numabalancing_enabled) |
| 855 | return; | 1589 | return; |
| 856 | 1590 | ||
| 857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 1591 | /* for example, ksmd faulting in a user's mm */ |
| 1592 | if (!p->mm) | ||
| 1593 | return; | ||
| 1594 | |||
| 1595 | /* Do not worry about placement if exiting */ | ||
| 1596 | if (p->state == TASK_DEAD) | ||
| 1597 | return; | ||
| 1598 | |||
| 1599 | /* Allocate buffer to track faults on a per-node basis */ | ||
| 1600 | if (unlikely(!p->numa_faults)) { | ||
| 1601 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | ||
| 1602 | |||
| 1603 | /* numa_faults and numa_faults_buffer share the allocation */ | ||
| 1604 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | ||
| 1605 | if (!p->numa_faults) | ||
| 1606 | return; | ||
| 1607 | |||
| 1608 | BUG_ON(p->numa_faults_buffer); | ||
| 1609 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | ||
| 1610 | p->total_numa_faults = 0; | ||
| 1611 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1612 | } | ||
| 858 | 1613 | ||
| 859 | /* | 1614 | /* |
| 860 | * If pages are properly placed (did not migrate) then scan slower. | 1615 | * First accesses are treated as private, otherwise consider accesses |
| 861 | * This is reset periodically in case of phase changes | 1616 | * to be private if the accessing pid has not changed |
| 862 | */ | 1617 | */ |
| 863 | if (!migrated) | 1618 | if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { |
| 864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 1619 | priv = 1; |
| 865 | p->numa_scan_period + jiffies_to_msecs(10)); | 1620 | } else { |
| 1621 | priv = cpupid_match_pid(p, last_cpupid); | ||
| 1622 | if (!priv && !(flags & TNF_NO_GROUP)) | ||
| 1623 | task_numa_group(p, last_cpupid, flags, &priv); | ||
| 1624 | } | ||
| 866 | 1625 | ||
| 867 | task_numa_placement(p); | 1626 | task_numa_placement(p); |
| 1627 | |||
| 1628 | /* | ||
| 1629 | * Retry task to preferred node migration periodically, in case it | ||
| 1630 | * previously failed, or the scheduler moved us. | ||
| 1631 | */ | ||
| 1632 | if (time_after(jiffies, p->numa_migrate_retry)) | ||
| 1633 | numa_migrate_preferred(p); | ||
| 1634 | |||
| 1635 | if (migrated) | ||
| 1636 | p->numa_pages_migrated += pages; | ||
| 1637 | |||
| 1638 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | ||
| 1639 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | ||
| 868 | } | 1640 | } |
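
task_numa_fault() classifies each hinting fault by comparing the pid packed into the page's last_cpupid with the current task: an invalid (first-touch) cpupid and a pid match both count as private, while a mismatch counts as shared and may trigger numa_group formation. A minimal sketch of that classification; the cpupid packing below is a toy encoding, not the kernel's LAST_CPUPID bitfield:

#include <stdbool.h>
#include <stdio.h>

#define CPUPID_INVALID (-1)

/* toy encoding: pack the last accessing cpu and pid into one int */
static int make_cpupid(int cpu, int pid) { return (cpu << 16) | (pid & 0xffff); }
static int cpupid_to_pid(int cpupid)     { return cpupid & 0xffff; }

/*
 * Returns true if the fault should be accounted as "private" to @pid,
 * i.e. the page was last touched by this task (or never touched at all).
 */
static bool fault_is_private(int last_cpupid, int pid)
{
    if (last_cpupid == CPUPID_INVALID)
        return true;                     /* first access: assume private */
    return cpupid_to_pid(last_cpupid) == (pid & 0xffff);
}

int main(void)
{
    int me = 1234, other = 5678;

    printf("%d\n", fault_is_private(CPUPID_INVALID, me));        /* 1: first touch */
    printf("%d\n", fault_is_private(make_cpupid(2, me), me));    /* 1: private     */
    printf("%d\n", fault_is_private(make_cpupid(3, other), me)); /* 0: shared      */
    return 0;
}
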
| 869 | 1641 | ||
| 870 | static void reset_ptenuma_scan(struct task_struct *p) | 1642 | static void reset_ptenuma_scan(struct task_struct *p) |
| @@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work) | |||
| 884 | struct mm_struct *mm = p->mm; | 1656 | struct mm_struct *mm = p->mm; |
| 885 | struct vm_area_struct *vma; | 1657 | struct vm_area_struct *vma; |
| 886 | unsigned long start, end; | 1658 | unsigned long start, end; |
| 1659 | unsigned long nr_pte_updates = 0; | ||
| 887 | long pages; | 1660 | long pages; |
| 888 | 1661 | ||
| 889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 1662 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
| @@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work) | |||
| 900 | if (p->flags & PF_EXITING) | 1673 | if (p->flags & PF_EXITING) |
| 901 | return; | 1674 | return; |
| 902 | 1675 | ||
| 903 | /* | 1676 | if (!mm->numa_next_scan) { |
| 904 | * We do not care about task placement until a task runs on a node | 1677 | mm->numa_next_scan = now + |
| 905 | * other than the first one used by the address space. This is | 1678 | msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
| 906 | * largely because migrations are driven by what CPU the task | ||
| 907 | * is running on. If it's never scheduled on another node, it'll | ||
| 908 | * not migrate so why bother trapping the fault. | ||
| 909 | */ | ||
| 910 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
| 911 | mm->first_nid = numa_node_id(); | ||
| 912 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
| 913 | /* Are we running on a new node yet? */ | ||
| 914 | if (numa_node_id() == mm->first_nid && | ||
| 915 | !sched_feat_numa(NUMA_FORCE)) | ||
| 916 | return; | ||
| 917 | |||
| 918 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
| 919 | } | ||
| 920 | |||
| 921 | /* | ||
| 922 | * Reset the scan period if enough time has gone by. Objective is that | ||
| 923 | * scanning will be reduced if pages are properly placed. As tasks | ||
| 924 | * can enter different phases this needs to be re-examined. Lacking | ||
| 925 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
| 926 | */ | ||
| 927 | migrate = mm->numa_next_reset; | ||
| 928 | if (time_after(now, migrate)) { | ||
| 929 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
| 930 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
| 931 | xchg(&mm->numa_next_reset, next_scan); | ||
| 932 | } | 1679 | } |
| 933 | 1680 | ||
| 934 | /* | 1681 | /* |
| @@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work) | |||
| 938 | if (time_before(now, migrate)) | 1685 | if (time_before(now, migrate)) |
| 939 | return; | 1686 | return; |
| 940 | 1687 | ||
| 941 | if (p->numa_scan_period == 0) | 1688 | if (p->numa_scan_period == 0) { |
| 942 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1689 | p->numa_scan_period_max = task_scan_max(p); |
| 1690 | p->numa_scan_period = task_scan_min(p); | ||
| 1691 | } | ||
| 943 | 1692 | ||
| 944 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 1693 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
| 945 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 1694 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
| 946 | return; | 1695 | return; |
| 947 | 1696 | ||
| 948 | /* | 1697 | /* |
| 949 | * Do not set pte_numa if the current running node is rate-limited. | 1698 | * Delay this task enough that another task of this mm will likely win |
| 950 | * This loses statistics on the fault but if we are unwilling to | 1699 | * the next time around. |
| 951 | * migrate to this node, it is less likely we can do useful work | ||
| 952 | */ | 1700 | */ |
| 953 | if (migrate_ratelimited(numa_node_id())) | 1701 | p->node_stamp += 2 * TICK_NSEC; |
| 954 | return; | ||
| 955 | 1702 | ||
| 956 | start = mm->numa_scan_offset; | 1703 | start = mm->numa_scan_offset; |
| 957 | pages = sysctl_numa_balancing_scan_size; | 1704 | pages = sysctl_numa_balancing_scan_size; |
| @@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work) | |||
| 967 | vma = mm->mmap; | 1714 | vma = mm->mmap; |
| 968 | } | 1715 | } |
| 969 | for (; vma; vma = vma->vm_next) { | 1716 | for (; vma; vma = vma->vm_next) { |
| 970 | if (!vma_migratable(vma)) | 1717 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) |
| 971 | continue; | 1718 | continue; |
| 972 | 1719 | ||
| 973 | /* Skip small VMAs. They are not likely to be of relevance */ | 1720 | /* |
| 974 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 1721 | * Shared library pages mapped by multiple processes are not |
| 1722 | * migrated as it is expected they are cache replicated. Avoid | ||
| 1723 | * hinting faults in read-only file-backed mappings or the vdso | ||
| 1724 | * as migrating the pages will be of marginal benefit. | ||
| 1725 | */ | ||
| 1726 | if (!vma->vm_mm || | ||
| 1727 | (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) | ||
| 975 | continue; | 1728 | continue; |
| 976 | 1729 | ||
| 977 | do { | 1730 | do { |
| 978 | start = max(start, vma->vm_start); | 1731 | start = max(start, vma->vm_start); |
| 979 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1732 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
| 980 | end = min(end, vma->vm_end); | 1733 | end = min(end, vma->vm_end); |
| 981 | pages -= change_prot_numa(vma, start, end); | 1734 | nr_pte_updates += change_prot_numa(vma, start, end); |
| 1735 | |||
| 1736 | /* | ||
| 1737 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
| 1738 | * at least one PTE is updated so that unused virtual | ||
| 1739 | * address space is quickly skipped. | ||
| 1740 | */ | ||
| 1741 | if (nr_pte_updates) | ||
| 1742 | pages -= (end - start) >> PAGE_SHIFT; | ||
| 982 | 1743 | ||
| 983 | start = end; | 1744 | start = end; |
| 984 | if (pages <= 0) | 1745 | if (pages <= 0) |
| @@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work) | |||
| 988 | 1749 | ||
| 989 | out: | 1750 | out: |
| 990 | /* | 1751 | /* |
| 991 | * It is possible to reach the end of the VMA list but the last few VMAs are | 1752 | * It is possible to reach the end of the VMA list but the last few |
| 992 | * not guaranteed to be vma_migratable. If they are not, we would find the | 1753 | * VMAs are not guaranteed to be vma_migratable. If they are not, we |
| 993 | * !migratable VMA on the next scan but not reset the scanner to the start | 1754 | * would find the !migratable VMA on the next scan but not reset the |
| 994 | * so check it now. | 1755 | * scanner to the start so check it now. |
| 995 | */ | 1756 | */ |
| 996 | if (vma) | 1757 | if (vma) |
| 997 | mm->numa_scan_offset = start; | 1758 | mm->numa_scan_offset = start; |
| @@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1025 | 1786 | ||
| 1026 | if (now - curr->node_stamp > period) { | 1787 | if (now - curr->node_stamp > period) { |
| 1027 | if (!curr->node_stamp) | 1788 | if (!curr->node_stamp) |
| 1028 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1789 | curr->numa_scan_period = task_scan_min(curr); |
| 1029 | curr->node_stamp = now; | 1790 | curr->node_stamp += period; |
| 1030 | 1791 | ||
| 1031 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1792 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |
| 1032 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 1793 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ |
| @@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1038 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1799 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
| 1039 | { | 1800 | { |
| 1040 | } | 1801 | } |
| 1802 | |||
| 1803 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 1804 | { | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 1808 | { | ||
| 1809 | } | ||
| 1041 | #endif /* CONFIG_NUMA_BALANCING */ | 1810 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1042 | 1811 | ||
| 1043 | static void | 1812 | static void |
| @@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1047 | if (!parent_entity(se)) | 1816 | if (!parent_entity(se)) |
| 1048 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1817 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
| 1049 | #ifdef CONFIG_SMP | 1818 | #ifdef CONFIG_SMP |
| 1050 | if (entity_is_task(se)) | 1819 | if (entity_is_task(se)) { |
| 1051 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1820 | struct rq *rq = rq_of(cfs_rq); |
| 1821 | |||
| 1822 | account_numa_enqueue(rq, task_of(se)); | ||
| 1823 | list_add(&se->group_node, &rq->cfs_tasks); | ||
| 1824 | } | ||
| 1052 | #endif | 1825 | #endif |
| 1053 | cfs_rq->nr_running++; | 1826 | cfs_rq->nr_running++; |
| 1054 | } | 1827 | } |
| @@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1059 | update_load_sub(&cfs_rq->load, se->load.weight); | 1832 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 1060 | if (!parent_entity(se)) | 1833 | if (!parent_entity(se)) |
| 1061 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1834 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
| 1062 | if (entity_is_task(se)) | 1835 | if (entity_is_task(se)) { |
| 1836 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
| 1063 | list_del_init(&se->group_node); | 1837 | list_del_init(&se->group_node); |
| 1838 | } | ||
| 1064 | cfs_rq->nr_running--; | 1839 | cfs_rq->nr_running--; |
| 1065 | } | 1840 | } |
| 1066 | 1841 | ||
| @@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void) | |||
| 2070 | return static_key_false(&__cfs_bandwidth_used); | 2845 | return static_key_false(&__cfs_bandwidth_used); |
| 2071 | } | 2846 | } |
| 2072 | 2847 | ||
| 2073 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 2848 | void cfs_bandwidth_usage_inc(void) |
| 2849 | { | ||
| 2850 | static_key_slow_inc(&__cfs_bandwidth_used); | ||
| 2851 | } | ||
| 2852 | |||
| 2853 | void cfs_bandwidth_usage_dec(void) | ||
| 2074 | { | 2854 | { |
| 2075 | /* only need to count groups transitioning between enabled/!enabled */ | 2855 | static_key_slow_dec(&__cfs_bandwidth_used); |
| 2076 | if (enabled && !was_enabled) | ||
| 2077 | static_key_slow_inc(&__cfs_bandwidth_used); | ||
| 2078 | else if (!enabled && was_enabled) | ||
| 2079 | static_key_slow_dec(&__cfs_bandwidth_used); | ||
| 2080 | } | 2856 | } |
| 2081 | #else /* HAVE_JUMP_LABEL */ | 2857 | #else /* HAVE_JUMP_LABEL */ |
| 2082 | static bool cfs_bandwidth_used(void) | 2858 | static bool cfs_bandwidth_used(void) |
| @@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void) | |||
| 2084 | return true; | 2860 | return true; |
| 2085 | } | 2861 | } |
| 2086 | 2862 | ||
| 2087 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | 2863 | void cfs_bandwidth_usage_inc(void) {} |
| 2864 | void cfs_bandwidth_usage_dec(void) {} | ||
| 2088 | #endif /* HAVE_JUMP_LABEL */ | 2865 | #endif /* HAVE_JUMP_LABEL */ |
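
Splitting account_cfs_bandwidth_used() into cfs_bandwidth_usage_inc()/dec() moves the "only count real transitions" logic to the caller (tg_set_cfs_bandwidth() in the scheduler core). A user-space stand-in showing what a caller now has to do; the counter and the set_group_bandwidth() helper below are illustrative, not the kernel's static-key implementation:

#include <stdbool.h>
#include <stdio.h>

/* stand-in for the __cfs_bandwidth_used static key: a plain counter */
static int cfs_bandwidth_users;

static void cfs_bandwidth_usage_inc(void) { cfs_bandwidth_users++; }
static void cfs_bandwidth_usage_dec(void) { cfs_bandwidth_users--; }

/*
 * With the split API, the caller only touches the key on a real
 * enabled <-> disabled transition, which is exactly what the old
 * account_cfs_bandwidth_used(enabled, was_enabled) did internally.
 */
static void set_group_bandwidth(bool was_enabled, bool enabled)
{
    if (enabled && !was_enabled)
        cfs_bandwidth_usage_inc();
    else if (!enabled && was_enabled)
        cfs_bandwidth_usage_dec();
}

int main(void)
{
    set_group_bandwidth(false, true);   /* group gains a quota: key goes up    */
    set_group_bandwidth(true, true);    /* quota merely changes: no transition */
    set_group_bandwidth(true, false);   /* quota removed: key goes back down   */
    printf("users = %d\n", cfs_bandwidth_users);   /* 0 */
    return 0;
}
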
| 2089 | 2866 | ||
| 2090 | /* | 2867 | /* |
| @@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 2335 | cfs_rq->throttled_clock = rq_clock(rq); | 3112 | cfs_rq->throttled_clock = rq_clock(rq); |
| 2336 | raw_spin_lock(&cfs_b->lock); | 3113 | raw_spin_lock(&cfs_b->lock); |
| 2337 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3114 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
| 3115 | if (!cfs_b->timer_active) | ||
| 3116 | __start_cfs_bandwidth(cfs_b); | ||
| 2338 | raw_spin_unlock(&cfs_b->lock); | 3117 | raw_spin_unlock(&cfs_b->lock); |
| 2339 | } | 3118 | } |
| 2340 | 3119 | ||
| @@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
| 2448 | if (idle) | 3227 | if (idle) |
| 2449 | goto out_unlock; | 3228 | goto out_unlock; |
| 2450 | 3229 | ||
| 3230 | /* | ||
| 3231 | * if we have relooped after returning idle once, we need to update our | ||
| 3232 | * status as actually running, so that other cpus doing | ||
| 3233 | * __start_cfs_bandwidth will stop trying to cancel us. | ||
| 3234 | */ | ||
| 3235 | cfs_b->timer_active = 1; | ||
| 3236 | |||
| 2451 | __refill_cfs_bandwidth_runtime(cfs_b); | 3237 | __refill_cfs_bandwidth_runtime(cfs_b); |
| 2452 | 3238 | ||
| 2453 | if (!throttled) { | 3239 | if (!throttled) { |
| @@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | |||
| 2508 | /* how long we wait to gather additional slack before distributing */ | 3294 | /* how long we wait to gather additional slack before distributing */ |
| 2509 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | 3295 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; |
| 2510 | 3296 | ||
| 2511 | /* are we near the end of the current quota period? */ | 3297 | /* |
| 3298 | * Are we near the end of the current quota period? | ||
| 3299 | * | ||
| 3300 | * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the | ||
| 3301 | * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of | ||
| 3302 | * migrate_hrtimers, base is never cleared, so we are fine. | ||
| 3303 | */ | ||
| 2512 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | 3304 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) |
| 2513 | { | 3305 | { |
| 2514 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | 3306 | struct hrtimer *refresh_timer = &cfs_b->period_timer; |
| @@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 2584 | u64 expires; | 3376 | u64 expires; |
| 2585 | 3377 | ||
| 2586 | /* confirm we're still not at a refresh boundary */ | 3378 | /* confirm we're still not at a refresh boundary */ |
| 2587 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | 3379 | raw_spin_lock(&cfs_b->lock); |
| 3380 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { | ||
| 3381 | raw_spin_unlock(&cfs_b->lock); | ||
| 2588 | return; | 3382 | return; |
| 3383 | } | ||
| 2589 | 3384 | ||
| 2590 | raw_spin_lock(&cfs_b->lock); | ||
| 2591 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3385 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { |
| 2592 | runtime = cfs_b->runtime; | 3386 | runtime = cfs_b->runtime; |
| 2593 | cfs_b->runtime = 0; | 3387 | cfs_b->runtime = 0; |
| @@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 2708 | * (timer_active==0 becomes visible before the hrtimer call-back | 3502 | * (timer_active==0 becomes visible before the hrtimer call-back |
| 2709 | * terminates). In either case we ensure that it's re-programmed | 3503 | * terminates). In either case we ensure that it's re-programmed |
| 2710 | */ | 3504 | */ |
| 2711 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | 3505 | while (unlikely(hrtimer_active(&cfs_b->period_timer)) && |
| 3506 | hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { | ||
| 3507 | /* bounce the lock to allow do_sched_cfs_period_timer to run */ | ||
| 2712 | raw_spin_unlock(&cfs_b->lock); | 3508 | raw_spin_unlock(&cfs_b->lock); |
| 2713 | /* ensure cfs_b->lock is available while we wait */ | 3509 | cpu_relax(); |
| 2714 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 2715 | |||
| 2716 | raw_spin_lock(&cfs_b->lock); | 3510 | raw_spin_lock(&cfs_b->lock); |
| 2717 | /* if someone else restarted the timer then we're done */ | 3511 | /* if someone else restarted the timer then we're done */ |
| 2718 | if (cfs_b->timer_active) | 3512 | if (cfs_b->timer_active) |
| @@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3113 | { | 3907 | { |
| 3114 | struct sched_entity *se = tg->se[cpu]; | 3908 | struct sched_entity *se = tg->se[cpu]; |
| 3115 | 3909 | ||
| 3116 | if (!tg->parent) /* the trivial, non-cgroup case */ | 3910 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ |
| 3117 | return wl; | 3911 | return wl; |
| 3118 | 3912 | ||
| 3119 | for_each_sched_entity(se) { | 3913 | for_each_sched_entity(se) { |
| @@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3166 | } | 3960 | } |
| 3167 | #else | 3961 | #else |
| 3168 | 3962 | ||
| 3169 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 3963 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
| 3170 | unsigned long wl, unsigned long wg) | ||
| 3171 | { | 3964 | { |
| 3172 | return wl; | 3965 | return wl; |
| 3173 | } | 3966 | } |
| @@ -3420,11 +4213,10 @@ done: | |||
| 3420 | * preempt must be disabled. | 4213 | * preempt must be disabled. |
| 3421 | */ | 4214 | */ |
| 3422 | static int | 4215 | static int |
| 3423 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 4216 | select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) |
| 3424 | { | 4217 | { |
| 3425 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4218 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
| 3426 | int cpu = smp_processor_id(); | 4219 | int cpu = smp_processor_id(); |
| 3427 | int prev_cpu = task_cpu(p); | ||
| 3428 | int new_cpu = cpu; | 4220 | int new_cpu = cpu; |
| 3429 | int want_affine = 0; | 4221 | int want_affine = 0; |
| 3430 | int sync = wake_flags & WF_SYNC; | 4222 | int sync = wake_flags & WF_SYNC; |
| @@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 3904 | 4696 | ||
| 3905 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4697 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
| 3906 | 4698 | ||
| 4699 | enum fbq_type { regular, remote, all }; | ||
| 4700 | |||
| 3907 | #define LBF_ALL_PINNED 0x01 | 4701 | #define LBF_ALL_PINNED 0x01 |
| 3908 | #define LBF_NEED_BREAK 0x02 | 4702 | #define LBF_NEED_BREAK 0x02 |
| 3909 | #define LBF_SOME_PINNED 0x04 | 4703 | #define LBF_DST_PINNED 0x04 |
| 4704 | #define LBF_SOME_PINNED 0x08 | ||
| 3910 | 4705 | ||
| 3911 | struct lb_env { | 4706 | struct lb_env { |
| 3912 | struct sched_domain *sd; | 4707 | struct sched_domain *sd; |
| @@ -3929,6 +4724,8 @@ struct lb_env { | |||
| 3929 | unsigned int loop; | 4724 | unsigned int loop; |
| 3930 | unsigned int loop_break; | 4725 | unsigned int loop_break; |
| 3931 | unsigned int loop_max; | 4726 | unsigned int loop_max; |
| 4727 | |||
| 4728 | enum fbq_type fbq_type; | ||
| 3932 | }; | 4729 | }; |
| 3933 | 4730 | ||
| 3934 | /* | 4731 | /* |
| @@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 3975 | return delta < (s64)sysctl_sched_migration_cost; | 4772 | return delta < (s64)sysctl_sched_migration_cost; |
| 3976 | } | 4773 | } |
| 3977 | 4774 | ||
| 4775 | #ifdef CONFIG_NUMA_BALANCING | ||
| 4776 | /* Returns true if the destination node has incurred more faults */ | ||
| 4777 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | ||
| 4778 | { | ||
| 4779 | int src_nid, dst_nid; | ||
| 4780 | |||
| 4781 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
| 4782 | !(env->sd->flags & SD_NUMA)) { | ||
| 4783 | return false; | ||
| 4784 | } | ||
| 4785 | |||
| 4786 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4787 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4788 | |||
| 4789 | if (src_nid == dst_nid) | ||
| 4790 | return false; | ||
| 4791 | |||
| 4792 | /* Always encourage migration to the preferred node. */ | ||
| 4793 | if (dst_nid == p->numa_preferred_nid) | ||
| 4794 | return true; | ||
| 4795 | |||
| 4796 | /* If both task and group weight improve, this move is a winner. */ | ||
| 4797 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | ||
| 4798 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
| 4799 | return true; | ||
| 4800 | |||
| 4801 | return false; | ||
| 4802 | } | ||
| 4803 | |||
| 4804 | |||
| 4805 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
| 4806 | { | ||
| 4807 | int src_nid, dst_nid; | ||
| 4808 | |||
| 4809 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
| 4810 | return false; | ||
| 4811 | |||
| 4812 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | ||
| 4813 | return false; | ||
| 4814 | |||
| 4815 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4816 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4817 | |||
| 4818 | if (src_nid == dst_nid) | ||
| 4819 | return false; | ||
| 4820 | |||
| 4821 | /* Migrating away from the preferred node is always bad. */ | ||
| 4822 | if (src_nid == p->numa_preferred_nid) | ||
| 4823 | return true; | ||
| 4824 | |||
| 4825 | /* If either task or group weight get worse, don't do it. */ | ||
| 4826 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
| 4827 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
| 4828 | return true; | ||
| 4829 | |||
| 4830 | return false; | ||
| 4831 | } | ||
| 4832 | |||
| 4833 | #else | ||
| 4834 | static inline bool migrate_improves_locality(struct task_struct *p, | ||
| 4835 | struct lb_env *env) | ||
| 4836 | { | ||
| 4837 | return false; | ||
| 4838 | } | ||
| 4839 | |||
| 4840 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
| 4841 | struct lb_env *env) | ||
| 4842 | { | ||
| 4843 | return false; | ||
| 4844 | } | ||
| 4845 | #endif | ||
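
can_migrate_task() then uses these two predicates as a locality gate: a move that improves NUMA locality is taken even if the task is cache hot, while a move that degrades locality is treated as if the task were cache hot and only happens after repeated balance failures. A condensed sketch of how the signals combine, with task_hot() and the locality checks reduced to plain booleans:

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether to migrate, given:
 *   cache_hot         - task_hot() said the task still has a warm cache
 *   improves_locality - destination node holds more of the task's faults
 *   degrades_locality - source node is preferred / weights get worse
 *   balance_failed    - too many prior balance attempts have failed
 */
static bool should_migrate(bool cache_hot, bool improves_locality,
                           bool degrades_locality, bool balance_failed)
{
    /* degrading locality is treated like being cache hot */
    if (!cache_hot)
        cache_hot = degrades_locality;

    /* improving locality overrides cache hotness outright */
    if (improves_locality)
        return true;

    return !cache_hot || balance_failed;
}

int main(void)
{
    /* hot task, but the move lands on its preferred node: migrate */
    printf("%d\n", should_migrate(true, true, false, false));   /* 1 */
    /* cold task, but the move leaves its preferred node: resist */
    printf("%d\n", should_migrate(false, false, true, false));  /* 0 */
    return 0;
}
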
| 4846 | |||
| 3978 | /* | 4847 | /* |
| 3979 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 4848 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 3980 | */ | 4849 | */ |
| @@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 3997 | 4866 | ||
| 3998 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 4867 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
| 3999 | 4868 | ||
| 4869 | env->flags |= LBF_SOME_PINNED; | ||
| 4870 | |||
| 4000 | /* | 4871 | /* |
| 4001 | * Remember if this task can be migrated to any other cpu in | 4872 | * Remember if this task can be migrated to any other cpu in |
| 4002 | * our sched_group. We may want to revisit it if we couldn't | 4873 | * our sched_group. We may want to revisit it if we couldn't |
| @@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 4005 | * Also avoid computing new_dst_cpu if we have already computed | 4876 | * Also avoid computing new_dst_cpu if we have already computed |
| 4006 | * one in current iteration. | 4877 | * one in current iteration. |
| 4007 | */ | 4878 | */ |
| 4008 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 4879 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) |
| 4009 | return 0; | 4880 | return 0; |
| 4010 | 4881 | ||
| 4011 | /* Prevent re-selecting dst_cpu via env's cpus */ | 4882 | /* Prevent re-selecting dst_cpu via env's cpus */ |
| 4012 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 4883 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
| 4013 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { | 4884 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
| 4014 | env->flags |= LBF_SOME_PINNED; | 4885 | env->flags |= LBF_DST_PINNED; |
| 4015 | env->new_dst_cpu = cpu; | 4886 | env->new_dst_cpu = cpu; |
| 4016 | break; | 4887 | break; |
| 4017 | } | 4888 | } |
| @@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 4030 | 4901 | ||
| 4031 | /* | 4902 | /* |
| 4032 | * Aggressive migration if: | 4903 | * Aggressive migration if: |
| 4033 | * 1) task is cache cold, or | 4904 | * 1) destination numa node is preferred |
| 4034 | * 2) too many balance attempts have failed. | 4905 | * 2) task is cache cold, or |
| 4906 | * 3) too many balance attempts have failed. | ||
| 4035 | */ | 4907 | */ |
| 4036 | |||
| 4037 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 4908 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
| 4909 | if (!tsk_cache_hot) | ||
| 4910 | tsk_cache_hot = migrate_degrades_locality(p, env); | ||
| 4911 | |||
| 4912 | if (migrate_improves_locality(p, env)) { | ||
| 4913 | #ifdef CONFIG_SCHEDSTATS | ||
| 4914 | if (tsk_cache_hot) { | ||
| 4915 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 4916 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 4917 | } | ||
| 4918 | #endif | ||
| 4919 | return 1; | ||
| 4920 | } | ||
| 4921 | |||
| 4038 | if (!tsk_cache_hot || | 4922 | if (!tsk_cache_hot || |
| 4039 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 4923 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 4040 | 4924 | ||
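The hunk above folds those checks into can_migrate_task()'s hot/cold logic: a destination that improves locality wins outright (the schedstat bump only records that a hot task was force-migrated), while a destination that degrades locality is treated exactly like cache hotness and only yields after nr_balance_failed exceeds cache_nice_tries. A condensed sketch of the ordering, leaving out the affinity and currently-running checks earlier in the function:

#include <stdbool.h>

/* Illustrative only; the booleans stand in for task_hot() and the two
 * locality helpers, the counters for the sched_domain fields. */
static int may_migrate(bool cache_hot, bool improves, bool degrades,
		       unsigned int nr_balance_failed,
		       unsigned int cache_nice_tries)
{
	if (!cache_hot)
		cache_hot = degrades;	/* losing locality counts as "hot" */

	if (improves)
		return 1;		/* locality beats cache hotness */

	if (!cache_hot || nr_balance_failed > cache_nice_tries)
		return 1;		/* cold, or we have failed too often */

	return 0;			/* leave the hot task where it is */
}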
| @@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env) | |||
| 4077 | return 0; | 4961 | return 0; |
| 4078 | } | 4962 | } |
| 4079 | 4963 | ||
| 4080 | static unsigned long task_h_load(struct task_struct *p); | ||
| 4081 | |||
| 4082 | static const unsigned int sched_nr_migrate_break = 32; | 4964 | static const unsigned int sched_nr_migrate_break = 32; |
| 4083 | 4965 | ||
| 4084 | /* | 4966 | /* |
| @@ -4291,6 +5173,10 @@ struct sg_lb_stats { | |||
| 4291 | unsigned int group_weight; | 5173 | unsigned int group_weight; |
| 4292 | int group_imb; /* Is there an imbalance in the group ? */ | 5174 | int group_imb; /* Is there an imbalance in the group ? */ |
| 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5175 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 5176 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5177 | unsigned int nr_numa_running; | ||
| 5178 | unsigned int nr_preferred_running; | ||
| 5179 | #endif | ||
| 4294 | }; | 5180 | }; |
| 4295 | 5181 | ||
| 4296 | /* | 5182 | /* |
| @@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 4330 | /** | 5216 | /** |
| 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 5217 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 4332 | * @sd: The sched_domain whose load_idx is to be obtained. | 5218 | * @sd: The sched_domain whose load_idx is to be obtained. |
| 4333 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 5219 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. |
| 4334 | * | 5220 | * |
| 4335 | * Return: The load index. | 5221 | * Return: The load index. |
| 4336 | */ | 5222 | */ |
| @@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4447 | { | 5333 | { |
| 4448 | struct sched_domain *child = sd->child; | 5334 | struct sched_domain *child = sd->child; |
| 4449 | struct sched_group *group, *sdg = sd->groups; | 5335 | struct sched_group *group, *sdg = sd->groups; |
| 4450 | unsigned long power; | 5336 | unsigned long power, power_orig; |
| 4451 | unsigned long interval; | 5337 | unsigned long interval; |
| 4452 | 5338 | ||
| 4453 | interval = msecs_to_jiffies(sd->balance_interval); | 5339 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4459 | return; | 5345 | return; |
| 4460 | } | 5346 | } |
| 4461 | 5347 | ||
| 4462 | power = 0; | 5348 | power_orig = power = 0; |
| 4463 | 5349 | ||
| 4464 | if (child->flags & SD_OVERLAP) { | 5350 | if (child->flags & SD_OVERLAP) { |
| 4465 | /* | 5351 | /* |
| @@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4467 | * span the current group. | 5353 | * span the current group. |
| 4468 | */ | 5354 | */ |
| 4469 | 5355 | ||
| 4470 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 5356 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
| 4471 | power += power_of(cpu); | 5357 | struct sched_group *sg = cpu_rq(cpu)->sd->groups; |
| 5358 | |||
| 5359 | power_orig += sg->sgp->power_orig; | ||
| 5360 | power += sg->sgp->power; | ||
| 5361 | } | ||
| 4472 | } else { | 5362 | } else { |
| 4473 | /* | 5363 | /* |
| 4474 | * !SD_OVERLAP domains can assume that child groups | 5364 | * !SD_OVERLAP domains can assume that child groups |
| @@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4477 | 5367 | ||
| 4478 | group = child->groups; | 5368 | group = child->groups; |
| 4479 | do { | 5369 | do { |
| 5370 | power_orig += group->sgp->power_orig; | ||
| 4480 | power += group->sgp->power; | 5371 | power += group->sgp->power; |
| 4481 | group = group->next; | 5372 | group = group->next; |
| 4482 | } while (group != child->groups); | 5373 | } while (group != child->groups); |
| 4483 | } | 5374 | } |
| 4484 | 5375 | ||
| 4485 | sdg->sgp->power_orig = sdg->sgp->power = power; | 5376 | sdg->sgp->power_orig = power_orig; |
| 5377 | sdg->sgp->power = power; | ||
| 4486 | } | 5378 | } |
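update_group_power() now carries the unscaled power_orig up the hierarchy alongside the rt/irq-scaled power, instead of setting both fields from the same sum; sg_capacity() further down needs the original value to count real cores. The aggregation itself is just two running totals over the child groups (or, for SD_OVERLAP, over the CPUs the group spans), roughly as in this simplified model, where group_power stands in for struct sched_group_power:

struct group_power { unsigned long power, power_orig; };

static void sum_child_power(const struct group_power *child, int nr_children,
			    struct group_power *parent)
{
	unsigned long power = 0, power_orig = 0;
	int i;

	for (i = 0; i < nr_children; i++) {
		power += child[i].power;		/* scaled by rt/irq pressure */
		power_orig += child[i].power_orig;	/* raw hardware capacity */
	}
	parent->power = power;
	parent->power_orig = power_orig;
}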
| 4487 | 5379 | ||
| 4488 | /* | 5380 | /* |
| @@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4526 | * cpu 3 and leave one of the cpus in the second group unused. | 5418 | * cpu 3 and leave one of the cpus in the second group unused. |
| 4527 | * | 5419 | * |
| 4528 | * The current solution to this issue is detecting the skew in the first group | 5420 | * The current solution to this issue is detecting the skew in the first group |
| 4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | 5421 | * by noticing the lower domain failed to reach balance and had difficulty |
| 4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | 5422 | * moving tasks due to affinity constraints. |
| 4531 | * sg_imbalanced(). | ||
| 4532 | * | 5423 | * |
| 4533 | * When this is so detected; this group becomes a candidate for busiest; see | 5424 | * When this is so detected; this group becomes a candidate for busiest; see |
| 4534 | * update_sd_pick_busiest(). And calculcate_imbalance() and | 5425 | * update_sd_pick_busiest(). And calculate_imbalance() and |
| 4535 | * find_busiest_group() avoid some of the usual balance conditional to allow it | 5426 | * find_busiest_group() avoid some of the usual balance conditions to allow it |
| 4536 | * to create an effective group imbalance. | 5427 | * to create an effective group imbalance. |
| 4537 | * | 5428 | * |
| 4538 | * This is a somewhat tricky proposition since the next run might not find the | 5429 | * This is a somewhat tricky proposition since the next run might not find the |
| @@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4540 | * subtle and fragile situation. | 5431 | * subtle and fragile situation. |
| 4541 | */ | 5432 | */ |
| 4542 | 5433 | ||
| 4543 | struct sg_imb_stats { | 5434 | static inline int sg_imbalanced(struct sched_group *group) |
| 4544 | unsigned long max_nr_running, min_nr_running; | ||
| 4545 | unsigned long max_cpu_load, min_cpu_load; | ||
| 4546 | }; | ||
| 4547 | |||
| 4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
| 4549 | { | 5435 | { |
| 4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | 5436 | return group->sgp->imbalance; |
| 4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
| 4552 | } | 5437 | } |
| 4553 | 5438 | ||
| 4554 | static inline void | 5439 | /* |
| 4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | 5440 | * Compute the group capacity. |
| 4556 | unsigned long load, unsigned long nr_running) | 5441 | * |
| 5442 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | ||
| 5443 | * first dividing out the smt factor and computing the actual number of cores | ||
| 5444 | * and limit power unit capacity with that. | ||
| 5445 | */ | ||
| 5446 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | ||
| 4557 | { | 5447 | { |
| 4558 | if (load > sgi->max_cpu_load) | 5448 | unsigned int capacity, smt, cpus; |
| 4559 | sgi->max_cpu_load = load; | 5449 | unsigned int power, power_orig; |
| 4560 | if (sgi->min_cpu_load > load) | ||
| 4561 | sgi->min_cpu_load = load; | ||
| 4562 | 5450 | ||
| 4563 | if (nr_running > sgi->max_nr_running) | 5451 | power = group->sgp->power; |
| 4564 | sgi->max_nr_running = nr_running; | 5452 | power_orig = group->sgp->power_orig; |
| 4565 | if (sgi->min_nr_running > nr_running) | 5453 | cpus = group->group_weight; |
| 4566 | sgi->min_nr_running = nr_running; | ||
| 4567 | } | ||
| 4568 | 5454 | ||
| 4569 | static inline int | 5455 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ |
| 4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | 5456 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); |
| 4571 | { | 5457 | capacity = cpus / smt; /* cores */ |
| 4572 | /* | ||
| 4573 | * Consider the group unbalanced when the imbalance is larger | ||
| 4574 | * than the average weight of a task. | ||
| 4575 | * | ||
| 4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4577 | * might not be a suitable number - should we keep a | ||
| 4578 | * normalized nr_running number somewhere that negates | ||
| 4579 | * the hierarchy? | ||
| 4580 | */ | ||
| 4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
| 4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
| 4583 | return 1; | ||
| 4584 | 5458 | ||
| 4585 | return 0; | 5459 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); |
| 5460 | if (!capacity) | ||
| 5461 | capacity = fix_small_capacity(env->sd, group); | ||
| 5462 | |||
| 5463 | return capacity; | ||
| 4586 | } | 5464 | } |
| 4587 | 5465 | ||
| 4588 | /** | 5466 | /** |
| @@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4597 | struct sched_group *group, int load_idx, | 5475 | struct sched_group *group, int load_idx, |
| 4598 | int local_group, struct sg_lb_stats *sgs) | 5476 | int local_group, struct sg_lb_stats *sgs) |
| 4599 | { | 5477 | { |
| 4600 | struct sg_imb_stats sgi; | ||
| 4601 | unsigned long nr_running; | 5478 | unsigned long nr_running; |
| 4602 | unsigned long load; | 5479 | unsigned long load; |
| 4603 | int i; | 5480 | int i; |
| 4604 | 5481 | ||
| 4605 | init_sg_imb_stats(&sgi); | 5482 | memset(sgs, 0, sizeof(*sgs)); |
| 4606 | 5483 | ||
| 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5484 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4608 | struct rq *rq = cpu_rq(i); | 5485 | struct rq *rq = cpu_rq(i); |
| @@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4610 | nr_running = rq->nr_running; | 5487 | nr_running = rq->nr_running; |
| 4611 | 5488 | ||
| 4612 | /* Bias balancing toward cpus of our domain */ | 5489 | /* Bias balancing toward cpus of our domain */ |
| 4613 | if (local_group) { | 5490 | if (local_group) |
| 4614 | load = target_load(i, load_idx); | 5491 | load = target_load(i, load_idx); |
| 4615 | } else { | 5492 | else |
| 4616 | load = source_load(i, load_idx); | 5493 | load = source_load(i, load_idx); |
| 4617 | update_sg_imb_stats(&sgi, load, nr_running); | ||
| 4618 | } | ||
| 4619 | 5494 | ||
| 4620 | sgs->group_load += load; | 5495 | sgs->group_load += load; |
| 4621 | sgs->sum_nr_running += nr_running; | 5496 | sgs->sum_nr_running += nr_running; |
| 5497 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5498 | sgs->nr_numa_running += rq->nr_numa_running; | ||
| 5499 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
| 5500 | #endif | ||
| 4622 | sgs->sum_weighted_load += weighted_cpuload(i); | 5501 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 4623 | if (idle_cpu(i)) | 5502 | if (idle_cpu(i)) |
| 4624 | sgs->idle_cpus++; | 5503 | sgs->idle_cpus++; |
| 4625 | } | 5504 | } |
| 4626 | 5505 | ||
| 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || | ||
| 4628 | time_after_eq(jiffies, group->sgp->next_update))) | ||
| 4629 | update_group_power(env->sd, env->dst_cpu); | ||
| 4630 | |||
| 4631 | /* Adjust by relative CPU power of the group */ | 5506 | /* Adjust by relative CPU power of the group */ |
| 4632 | sgs->group_power = group->sgp->power; | 5507 | sgs->group_power = group->sgp->power; |
| 4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5508 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; |
| @@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4635 | if (sgs->sum_nr_running) | 5510 | if (sgs->sum_nr_running) |
| 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5511 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 4637 | 5512 | ||
| 4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
| 4639 | |||
| 4640 | sgs->group_capacity = | ||
| 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
| 4642 | |||
| 4643 | if (!sgs->group_capacity) | ||
| 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); | ||
| 4645 | |||
| 4646 | sgs->group_weight = group->group_weight; | 5513 | sgs->group_weight = group->group_weight; |
| 4647 | 5514 | ||
| 5515 | sgs->group_imb = sg_imbalanced(group); | ||
| 5516 | sgs->group_capacity = sg_capacity(env, group); | ||
| 5517 | |||
| 4648 | if (sgs->group_capacity > sgs->sum_nr_running) | 5518 | if (sgs->group_capacity > sgs->sum_nr_running) |
| 4649 | sgs->group_has_capacity = 1; | 5519 | sgs->group_has_capacity = 1; |
| 4650 | } | 5520 | } |
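With the min/max bookkeeping gone, update_sg_lb_stats() is a plain accumulation pass: it sums load, nr_running, the new NUMA counters and weighted load over the group's CPUs, then normalises by group power, avg_load = group_load * SCHED_POWER_SCALE / group_power. As an illustration with made-up numbers, a group with group_load = 2048 and group_power = 2048 (two full-strength CPUs) reports avg_load = 2048 * 1024 / 2048 = 1024, while the same load on a group whose power has been scaled down to 1024 reports 2048, so the reduced-capacity group correctly looks twice as loaded to find_busiest_group().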
| @@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 4693 | return false; | 5563 | return false; |
| 4694 | } | 5564 | } |
| 4695 | 5565 | ||
| 5566 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5567 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5568 | { | ||
| 5569 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
| 5570 | return regular; | ||
| 5571 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
| 5572 | return remote; | ||
| 5573 | return all; | ||
| 5574 | } | ||
| 5575 | |||
| 5576 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5577 | { | ||
| 5578 | if (rq->nr_running > rq->nr_numa_running) | ||
| 5579 | return regular; | ||
| 5580 | if (rq->nr_running > rq->nr_preferred_running) | ||
| 5581 | return remote; | ||
| 5582 | return all; | ||
| 5583 | } | ||
| 5584 | #else | ||
| 5585 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5586 | { | ||
| 5587 | return all; | ||
| 5588 | } | ||
| 5589 | |||
| 5590 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5591 | { | ||
| 5592 | return regular; | ||
| 5593 | } | ||
| 5594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
| 5595 | |||
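The classification is driven by two per-rq counters maintained elsewhere in this series (nr_numa_running: tasks whose NUMA faults are being tracked; nr_preferred_running: tasks already running on their preferred node). A group or runqueue is 'regular' while it still holds non-NUMA tasks that can be pulled freely, 'remote' when everything is NUMA-tracked but some tasks sit on the wrong node, and 'all' when every task is where it wants to be. A standalone sketch of the same three-way split:

/* Sketch only; the counters stand in for rq->nr_numa_running and
 * rq->nr_preferred_running. The enum order (least to most settled)
 * is what makes the filter in find_busiest_queue() a single compare. */
enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
			      unsigned int nr_numa,
			      unsigned int nr_preferred)
{
	if (nr_running > nr_numa)	/* plain tasks left: safe to pull from */
		return regular;
	if (nr_running > nr_preferred)	/* only numa tasks, some misplaced */
		return remote;
	return all;			/* everything already well placed */
}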
| 4696 | /** | 5596 | /** |
| 4697 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5597 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
| 4698 | * @env: The load balancing environment. | 5598 | * @env: The load balancing environment. |
| 4699 | * @balance: Should we balance. | ||
| 4700 | * @sds: variable to hold the statistics for this sched_domain. | 5599 | * @sds: variable to hold the statistics for this sched_domain. |
| 4701 | */ | 5600 | */ |
| 4702 | static inline void update_sd_lb_stats(struct lb_env *env, | 5601 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
| 4703 | struct sd_lb_stats *sds) | ||
| 4704 | { | 5602 | { |
| 4705 | struct sched_domain *child = env->sd->child; | 5603 | struct sched_domain *child = env->sd->child; |
| 4706 | struct sched_group *sg = env->sd->groups; | 5604 | struct sched_group *sg = env->sd->groups; |
| @@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4720 | if (local_group) { | 5618 | if (local_group) { |
| 4721 | sds->local = sg; | 5619 | sds->local = sg; |
| 4722 | sgs = &sds->local_stat; | 5620 | sgs = &sds->local_stat; |
| 5621 | |||
| 5622 | if (env->idle != CPU_NEWLY_IDLE || | ||
| 5623 | time_after_eq(jiffies, sg->sgp->next_update)) | ||
| 5624 | update_group_power(env->sd, env->dst_cpu); | ||
| 4723 | } | 5625 | } |
| 4724 | 5626 | ||
| 4725 | memset(sgs, 0, sizeof(*sgs)); | ||
| 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 5627 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
| 4727 | 5628 | ||
| 5629 | if (local_group) | ||
| 5630 | goto next_group; | ||
| 5631 | |||
| 4728 | /* | 5632 | /* |
| 4729 | * In case the child domain prefers tasks go to siblings | 5633 | * In case the child domain prefers tasks go to siblings |
| 4730 | * first, lower the sg capacity to one so that we'll try | 5634 | * first, lower the sg capacity to one so that we'll try |
| @@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4735 | * heaviest group when it is already under-utilized (possible | 5639 | * heaviest group when it is already under-utilized (possible |
| 4736 | * when a large-weight task outweighs the tasks on the system). | 5640 | * when a large-weight task outweighs the tasks on the system). |
| 4737 | */ | 5641 | */ |
| 4738 | if (prefer_sibling && !local_group && | 5642 | if (prefer_sibling && sds->local && |
| 4739 | sds->local && sds->local_stat.group_has_capacity) | 5643 | sds->local_stat.group_has_capacity) |
| 4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 5644 | sgs->group_capacity = min(sgs->group_capacity, 1U); |
| 4741 | 5645 | ||
| 4742 | /* Now, start updating sd_lb_stats */ | 5646 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 4743 | sds->total_load += sgs->group_load; | ||
| 4744 | sds->total_pwr += sgs->group_power; | ||
| 4745 | |||
| 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { | ||
| 4747 | sds->busiest = sg; | 5647 | sds->busiest = sg; |
| 4748 | sds->busiest_stat = *sgs; | 5648 | sds->busiest_stat = *sgs; |
| 4749 | } | 5649 | } |
| 4750 | 5650 | ||
| 5651 | next_group: | ||
| 5652 | /* Now, start updating sd_lb_stats */ | ||
| 5653 | sds->total_load += sgs->group_load; | ||
| 5654 | sds->total_pwr += sgs->group_power; | ||
| 5655 | |||
| 4751 | sg = sg->next; | 5656 | sg = sg->next; |
| 4752 | } while (sg != env->sd->groups); | 5657 | } while (sg != env->sd->groups); |
| 5658 | |||
| 5659 | if (env->sd->flags & SD_NUMA) | ||
| 5660 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
| 4753 | } | 5661 | } |
| 4754 | 5662 | ||
| 4755 | /** | 5663 | /** |
| @@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 5053 | int i; | 5961 | int i; |
| 5054 | 5962 | ||
| 5055 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5963 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 5056 | unsigned long power = power_of(i); | 5964 | unsigned long power, capacity, wl; |
| 5057 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5965 | enum fbq_type rt; |
| 5058 | SCHED_POWER_SCALE); | 5966 | |
| 5059 | unsigned long wl; | 5967 | rq = cpu_rq(i); |
| 5968 | rt = fbq_classify_rq(rq); | ||
| 5969 | |||
| 5970 | /* | ||
| 5971 | * We classify groups/runqueues into three groups: | ||
| 5972 | * - regular: there are !numa tasks | ||
| 5973 | * - remote: there are numa tasks that run on the 'wrong' node | ||
| 5974 | * - all: there is no distinction | ||
| 5975 | * | ||
| 5976 | * In order to avoid migrating ideally placed numa tasks, | ||
| 5977 | * ignore those when there are better options. | ||
| 5978 | * | ||
| 5979 | * If we ignore the actual busiest queue to migrate another | ||
| 5980 | * task, the next balance pass can still reduce the busiest | ||
| 5981 | * queue by moving tasks around inside the node. | ||
| 5982 | * | ||
| 5983 | * If we cannot move enough load due to this classification | ||
| 5984 | * the next pass will adjust the group classification and | ||
| 5985 | * allow migration of more tasks. | ||
| 5986 | * | ||
| 5987 | * Both cases only affect the total convergence complexity. | ||
| 5988 | */ | ||
| 5989 | if (rt > env->fbq_type) | ||
| 5990 | continue; | ||
| 5060 | 5991 | ||
| 5992 | power = power_of(i); | ||
| 5993 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
| 5061 | if (!capacity) | 5994 | if (!capacity) |
| 5062 | capacity = fix_small_capacity(env->sd, group); | 5995 | capacity = fix_small_capacity(env->sd, group); |
| 5063 | 5996 | ||
| 5064 | rq = cpu_rq(i); | ||
| 5065 | wl = weighted_cpuload(i); | 5997 | wl = weighted_cpuload(i); |
| 5066 | 5998 | ||
| 5067 | /* | 5999 | /* |
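Because the enum is ordered regular < remote < all, the single test rt > env->fbq_type skips any runqueue whose tasks are better settled than the busiest group as a whole: with env->fbq_type == remote, a queue classified all (2 > 1) is passed over while regular and remote queues stay eligible, and a group classified all filters nothing. As the comment above notes, skipping the nominally busiest queue only delays convergence; a later pass reclassifies the group and can still drain it.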
| @@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5164 | int *continue_balancing) | 6096 | int *continue_balancing) |
| 5165 | { | 6097 | { |
| 5166 | int ld_moved, cur_ld_moved, active_balance = 0; | 6098 | int ld_moved, cur_ld_moved, active_balance = 0; |
| 6099 | struct sched_domain *sd_parent = sd->parent; | ||
| 5167 | struct sched_group *group; | 6100 | struct sched_group *group; |
| 5168 | struct rq *busiest; | 6101 | struct rq *busiest; |
| 5169 | unsigned long flags; | 6102 | unsigned long flags; |
| @@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5177 | .idle = idle, | 6110 | .idle = idle, |
| 5178 | .loop_break = sched_nr_migrate_break, | 6111 | .loop_break = sched_nr_migrate_break, |
| 5179 | .cpus = cpus, | 6112 | .cpus = cpus, |
| 6113 | .fbq_type = all, | ||
| 5180 | }; | 6114 | }; |
| 5181 | 6115 | ||
| 5182 | /* | 6116 | /* |
| @@ -5268,17 +6202,17 @@ more_balance: | |||
| 5268 | * moreover subsequent load balance cycles should correct the | 6202 | * moreover subsequent load balance cycles should correct the |
| 5269 | * excess load moved. | 6203 | * excess load moved. |
| 5270 | */ | 6204 | */ |
| 5271 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6205 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
| 6206 | |||
| 6207 | /* Prevent re-selecting dst_cpu via env's cpus */ | ||
| 6208 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5272 | 6209 | ||
| 5273 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 6210 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 5274 | env.dst_cpu = env.new_dst_cpu; | 6211 | env.dst_cpu = env.new_dst_cpu; |
| 5275 | env.flags &= ~LBF_SOME_PINNED; | 6212 | env.flags &= ~LBF_DST_PINNED; |
| 5276 | env.loop = 0; | 6213 | env.loop = 0; |
| 5277 | env.loop_break = sched_nr_migrate_break; | 6214 | env.loop_break = sched_nr_migrate_break; |
| 5278 | 6215 | ||
| 5279 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
| 5280 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5281 | |||
| 5282 | /* | 6216 | /* |
| 5283 | * Go back to "more_balance" rather than "redo" since we | 6217 | * Go back to "more_balance" rather than "redo" since we |
| 5284 | * need to continue with same src_cpu. | 6218 | * need to continue with same src_cpu. |
| @@ -5286,6 +6220,18 @@ more_balance: | |||
| 5286 | goto more_balance; | 6220 | goto more_balance; |
| 5287 | } | 6221 | } |
| 5288 | 6222 | ||
| 6223 | /* | ||
| 6224 | * We failed to reach balance because of affinity. | ||
| 6225 | */ | ||
| 6226 | if (sd_parent) { | ||
| 6227 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | ||
| 6228 | |||
| 6229 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | ||
| 6230 | *group_imbalance = 1; | ||
| 6231 | } else if (*group_imbalance) | ||
| 6232 | *group_imbalance = 0; | ||
| 6233 | } | ||
| 6234 | |||
| 5289 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6235 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 5290 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 6236 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 5291 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 6237 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
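The pinned-task handling is now split over two flags: LBF_DST_PINNED keeps the existing retry-with-new_dst_cpu path, while LBF_SOME_PINNED merely records that affinity blocked at least one move. When an imbalance remains despite that, the parent group's sgp->imbalance is latched so that sg_imbalanced() treats the group as imbalanced on the next pass at the parent level, and it is cleared again once balance is reached. A condensed sketch of that epilogue, with the env and sched_domain fields reduced to plain variables:

#define LBF_SOME_PINNED	0x1	/* some task could not leave src_cpu */
#define LBF_DST_PINNED	0x2	/* ...but another dst-group cpu may take it */

/* Simplified stand-in for the relevant parts of struct lb_env. */
struct balance_state {
	unsigned int flags;
	long imbalance;
	int *parent_group_imbalance;	/* &sd->parent->groups->sgp->imbalance */
};

static void note_affinity_failure(struct balance_state *env)
{
	if (!env->parent_group_imbalance)
		return;		/* topmost domain: no parent to inform */

	if ((env->flags & LBF_SOME_PINNED) && env->imbalance > 0)
		*env->parent_group_imbalance = 1;	/* let the parent see the skew */
	else if (*env->parent_group_imbalance)
		*env->parent_group_imbalance = 0;	/* balanced again: clear it */
}

The flag values above are arbitrary; the real definitions live next to struct lb_env earlier in the file.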
| @@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5393 | struct sched_domain *sd; | 6339 | struct sched_domain *sd; |
| 5394 | int pulled_task = 0; | 6340 | int pulled_task = 0; |
| 5395 | unsigned long next_balance = jiffies + HZ; | 6341 | unsigned long next_balance = jiffies + HZ; |
| 6342 | u64 curr_cost = 0; | ||
| 5396 | 6343 | ||
| 5397 | this_rq->idle_stamp = rq_clock(this_rq); | 6344 | this_rq->idle_stamp = rq_clock(this_rq); |
| 5398 | 6345 | ||
| @@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5409 | for_each_domain(this_cpu, sd) { | 6356 | for_each_domain(this_cpu, sd) { |
| 5410 | unsigned long interval; | 6357 | unsigned long interval; |
| 5411 | int continue_balancing = 1; | 6358 | int continue_balancing = 1; |
| 6359 | u64 t0, domain_cost; | ||
| 5412 | 6360 | ||
| 5413 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6361 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5414 | continue; | 6362 | continue; |
| 5415 | 6363 | ||
| 6364 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | ||
| 6365 | break; | ||
| 6366 | |||
| 5416 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6367 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 6368 | t0 = sched_clock_cpu(this_cpu); | ||
| 6369 | |||
| 5417 | /* If we've pulled tasks over stop searching: */ | 6370 | /* If we've pulled tasks over stop searching: */ |
| 5418 | pulled_task = load_balance(this_cpu, this_rq, | 6371 | pulled_task = load_balance(this_cpu, this_rq, |
| 5419 | sd, CPU_NEWLY_IDLE, | 6372 | sd, CPU_NEWLY_IDLE, |
| 5420 | &continue_balancing); | 6373 | &continue_balancing); |
| 6374 | |||
| 6375 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
| 6376 | if (domain_cost > sd->max_newidle_lb_cost) | ||
| 6377 | sd->max_newidle_lb_cost = domain_cost; | ||
| 6378 | |||
| 6379 | curr_cost += domain_cost; | ||
| 5421 | } | 6380 | } |
| 5422 | 6381 | ||
| 5423 | interval = msecs_to_jiffies(sd->balance_interval); | 6382 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5439 | */ | 6398 | */ |
| 5440 | this_rq->next_balance = next_balance; | 6399 | this_rq->next_balance = next_balance; |
| 5441 | } | 6400 | } |
| 6401 | |||
| 6402 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 6403 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 5442 | } | 6404 | } |
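idle_balance() now times each newly-idle balance attempt with sched_clock_cpu(), keeps the per-domain worst case in max_newidle_lb_cost, and abandons the domain walk as soon as the expected idle time (avg_idle) can no longer cover the cost already spent plus the next domain's worst case. The accumulated total is folded into rq->max_idle_balance_cost, which the rest of the series uses to bound rq->avg_idle (hence the "reasonable floor" comment in rebalance_domains() further down). A reduced model of the loop, where now() and try_balance() are hypothetical helpers and the domain array stands in for the sched_domain hierarchy:

#include <stdint.h>

struct domain { uint64_t max_newidle_lb_cost; };

extern uint64_t now(void);			/* ~ sched_clock_cpu() */
extern int try_balance(struct domain *d);	/* ~ load_balance(..., CPU_NEWLY_IDLE, ...) */

static uint64_t newidle_balance(struct domain *doms, int nr, uint64_t avg_idle)
{
	uint64_t curr_cost = 0;
	int i;

	for (i = 0; i < nr; i++) {
		uint64_t t0, cost;
		int pulled;

		/* not worth starting if we expect to be handed work sooner */
		if (avg_idle < curr_cost + doms[i].max_newidle_lb_cost)
			break;

		t0 = now();
		pulled = try_balance(&doms[i]);
		cost = now() - t0;

		if (cost > doms[i].max_newidle_lb_cost)
			doms[i].max_newidle_lb_cost = cost;
		curr_cost += cost;

		if (pulled)
			break;		/* we found work: stop searching */
	}
	return curr_cost;	/* caller raises rq->max_idle_balance_cost if larger */
}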
| 5443 | 6405 | ||
| 5444 | /* | 6406 | /* |
| @@ -5572,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
| 5572 | static inline void set_cpu_sd_state_busy(void) | 6534 | static inline void set_cpu_sd_state_busy(void) |
| 5573 | { | 6535 | { |
| 5574 | struct sched_domain *sd; | 6536 | struct sched_domain *sd; |
| 6537 | int cpu = smp_processor_id(); | ||
| 5575 | 6538 | ||
| 5576 | rcu_read_lock(); | 6539 | rcu_read_lock(); |
| 5577 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6540 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5578 | 6541 | ||
| 5579 | if (!sd || !sd->nohz_idle) | 6542 | if (!sd || !sd->nohz_idle) |
| 5580 | goto unlock; | 6543 | goto unlock; |
| 5581 | sd->nohz_idle = 0; | 6544 | sd->nohz_idle = 0; |
| 5582 | 6545 | ||
| 5583 | for (; sd; sd = sd->parent) | 6546 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
| 5584 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
| 5585 | unlock: | 6547 | unlock: |
| 5586 | rcu_read_unlock(); | 6548 | rcu_read_unlock(); |
| 5587 | } | 6549 | } |
| @@ -5589,16 +6551,16 @@ unlock: | |||
| 5589 | void set_cpu_sd_state_idle(void) | 6551 | void set_cpu_sd_state_idle(void) |
| 5590 | { | 6552 | { |
| 5591 | struct sched_domain *sd; | 6553 | struct sched_domain *sd; |
| 6554 | int cpu = smp_processor_id(); | ||
| 5592 | 6555 | ||
| 5593 | rcu_read_lock(); | 6556 | rcu_read_lock(); |
| 5594 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6557 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5595 | 6558 | ||
| 5596 | if (!sd || sd->nohz_idle) | 6559 | if (!sd || sd->nohz_idle) |
| 5597 | goto unlock; | 6560 | goto unlock; |
| 5598 | sd->nohz_idle = 1; | 6561 | sd->nohz_idle = 1; |
| 5599 | 6562 | ||
| 5600 | for (; sd; sd = sd->parent) | 6563 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
| 5601 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
| 5602 | unlock: | 6564 | unlock: |
| 5603 | rcu_read_unlock(); | 6565 | rcu_read_unlock(); |
| 5604 | } | 6566 | } |
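Both NO_HZ state transitions above now touch a single nr_busy_cpus counter reached through the new per-cpu sd_busy pointer (cached outside this file by the same series) instead of walking every level of the domain hierarchy, so entering or leaving nohz idle costs one atomic operation rather than one per level.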
| @@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5662 | /* Earliest time when we have to do rebalance again */ | 6624 | /* Earliest time when we have to do rebalance again */ |
| 5663 | unsigned long next_balance = jiffies + 60*HZ; | 6625 | unsigned long next_balance = jiffies + 60*HZ; |
| 5664 | int update_next_balance = 0; | 6626 | int update_next_balance = 0; |
| 5665 | int need_serialize; | 6627 | int need_serialize, need_decay = 0; |
| 6628 | u64 max_cost = 0; | ||
| 5666 | 6629 | ||
| 5667 | update_blocked_averages(cpu); | 6630 | update_blocked_averages(cpu); |
| 5668 | 6631 | ||
| 5669 | rcu_read_lock(); | 6632 | rcu_read_lock(); |
| 5670 | for_each_domain(cpu, sd) { | 6633 | for_each_domain(cpu, sd) { |
| 6634 | /* | ||
| 6635 | * Decay the newidle max times here because this is a regular | ||
| 6636 | * visit to all the domains. Decay ~1% per second. | ||
| 6637 | */ | ||
| 6638 | if (time_after(jiffies, sd->next_decay_max_lb_cost)) { | ||
| 6639 | sd->max_newidle_lb_cost = | ||
| 6640 | (sd->max_newidle_lb_cost * 253) / 256; | ||
| 6641 | sd->next_decay_max_lb_cost = jiffies + HZ; | ||
| 6642 | need_decay = 1; | ||
| 6643 | } | ||
| 6644 | max_cost += sd->max_newidle_lb_cost; | ||
| 6645 | |||
| 5671 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6646 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5672 | continue; | 6647 | continue; |
| 5673 | 6648 | ||
| 6649 | /* | ||
| 6650 | * Stop the load balance at this level. There is another | ||
| 6651 | * CPU in our sched group which is doing load balancing more | ||
| 6652 | * actively. | ||
| 6653 | */ | ||
| 6654 | if (!continue_balancing) { | ||
| 6655 | if (need_decay) | ||
| 6656 | continue; | ||
| 6657 | break; | ||
| 6658 | } | ||
| 6659 | |||
| 5674 | interval = sd->balance_interval; | 6660 | interval = sd->balance_interval; |
| 5675 | if (idle != CPU_IDLE) | 6661 | if (idle != CPU_IDLE) |
| 5676 | interval *= sd->busy_factor; | 6662 | interval *= sd->busy_factor; |
| @@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5689 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 6675 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 5690 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { | 6676 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
| 5691 | /* | 6677 | /* |
| 5692 | * The LBF_SOME_PINNED logic could have changed | 6678 | * The LBF_DST_PINNED logic could have changed |
| 5693 | * env->dst_cpu, so we can't know our idle | 6679 | * env->dst_cpu, so we can't know our idle |
| 5694 | * state even if we migrated tasks. Update it. | 6680 | * state even if we migrated tasks. Update it. |
| 5695 | */ | 6681 | */ |
| @@ -5704,14 +6690,14 @@ out: | |||
| 5704 | next_balance = sd->last_balance + interval; | 6690 | next_balance = sd->last_balance + interval; |
| 5705 | update_next_balance = 1; | 6691 | update_next_balance = 1; |
| 5706 | } | 6692 | } |
| 5707 | 6693 | } | |
| 6694 | if (need_decay) { | ||
| 5708 | /* | 6695 | /* |
| 5709 | * Stop the load balance at this level. There is another | 6696 | * Ensure the rq-wide value also decays but keep it at a |
| 5710 | * CPU in our sched group which is doing load balancing more | 6697 | * reasonable floor to avoid funnies with rq->avg_idle. |
| 5711 | * actively. | ||
| 5712 | */ | 6698 | */ |
| 5713 | if (!continue_balancing) | 6699 | rq->max_idle_balance_cost = |
| 5714 | break; | 6700 | max((u64)sysctl_sched_migration_cost, max_cost); |
| 5715 | } | 6701 | } |
| 5716 | rcu_read_unlock(); | 6702 | rcu_read_unlock(); |
| 5717 | 6703 | ||
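The decay factor of 253/256 applied once per second trims max_newidle_lb_cost by roughly 1.2% per visit: a recorded worst case of 100,000 ns becomes 98,828 ns after one second, about 87,000 ns after twelve seconds, and halves in roughly a minute, so a single unusually expensive newidle balance stops throttling future attempts fairly quickly. The rq-wide max_idle_balance_cost is then recomputed as the sum of the per-domain maxima, floored at sysctl_sched_migration_cost so that rq->avg_idle comparisons keep a sane lower bound.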
| @@ -5781,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5781 | { | 6767 | { |
| 5782 | unsigned long now = jiffies; | 6768 | unsigned long now = jiffies; |
| 5783 | struct sched_domain *sd; | 6769 | struct sched_domain *sd; |
| 6770 | struct sched_group_power *sgp; | ||
| 6771 | int nr_busy; | ||
| 5784 | 6772 | ||
| 5785 | if (unlikely(idle_cpu(cpu))) | 6773 | if (unlikely(idle_cpu(cpu))) |
| 5786 | return 0; | 6774 | return 0; |
| @@ -5806,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5806 | goto need_kick; | 6794 | goto need_kick; |
| 5807 | 6795 | ||
| 5808 | rcu_read_lock(); | 6796 | rcu_read_lock(); |
| 5809 | for_each_domain(cpu, sd) { | 6797 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5810 | struct sched_group *sg = sd->groups; | ||
| 5811 | struct sched_group_power *sgp = sg->sgp; | ||
| 5812 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5813 | 6798 | ||
| 5814 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) | 6799 | if (sd) { |
| 5815 | goto need_kick_unlock; | 6800 | sgp = sd->groups->sgp; |
| 6801 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5816 | 6802 | ||
| 5817 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | 6803 | if (nr_busy > 1) |
| 5818 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 5819 | sched_domain_span(sd)) < cpu)) | ||
| 5820 | goto need_kick_unlock; | 6804 | goto need_kick_unlock; |
| 5821 | |||
| 5822 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
| 5823 | break; | ||
| 5824 | } | 6805 | } |
| 6806 | |||
| 6807 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 6808 | |||
| 6809 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 6810 | sched_domain_span(sd)) < cpu)) | ||
| 6811 | goto need_kick_unlock; | ||
| 6812 | |||
| 5825 | rcu_read_unlock(); | 6813 | rcu_read_unlock(); |
| 5826 | return 0; | 6814 | return 0; |
| 5827 | 6815 | ||
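nohz_kick_needed() gets the same treatment: instead of iterating over all domains it consults two cached per-cpu pointers (set up outside this file), sd_busy to ask whether more than one CPU in the package is busy, and sd_asym to handle SD_ASYM_PACKING, where a lower-numbered idle CPU should be kicked to take the load over.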
| @@ -6214,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 6214 | se->cfs_rq = parent->my_q; | 7202 | se->cfs_rq = parent->my_q; |
| 6215 | 7203 | ||
| 6216 | se->my_q = cfs_rq; | 7204 | se->my_q = cfs_rq; |
| 6217 | update_load_set(&se->load, 0); | 7205 | /* guarantee group entities always have weight */ |
| 7206 | update_load_set(&se->load, NICE_0_LOAD); | ||
| 6218 | se->parent = parent; | 7207 | se->parent = parent; |
| 6219 | } | 7208 | } |
| 6220 | 7209 | ||
