path: root/kernel/sched/fair.c
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 1397
1 file changed, 1193 insertions(+), 204 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..df77c605c7a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give a new task initial runnable values so its load weighs heavily in its infancy */ 688/* Give a new task initial runnable values so its load weighs heavily in its infancy */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations based on RSS as non-present and empty pages are skipped
851 * by the PTE scanner and NUMA hinting faults should be trapped based
852 * on resident pages
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
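For concreteness, the window arithmetic above can be checked with a standalone sketch (not part of the patch; it assumes 4K pages and the 256MB default scan size, and the helper name is illustrative):

#include <stdio.h>

#define PAGE_SHIFT   12		/* assumed 4K pages */
#define SCAN_SIZE_MB 256	/* sysctl_numa_balancing_scan_size default */

/* mirrors task_nr_scan_windows(): scan windows needed to cover the RSS */
static unsigned int nr_scan_windows(unsigned long rss_pages)
{
	unsigned long nr_scan_pages = (unsigned long)SCAN_SIZE_MB << (20 - PAGE_SHIFT);

	if (!rss_pages)
		rss_pages = nr_scan_pages;
	/* open-coded round_up(rss, nr_scan_pages) / nr_scan_pages */
	return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
}

int main(void)
{
	/* 1GB resident = 262144 4K pages -> 4 windows of 256MB each */
	printf("%u\n", nr_scan_windows(262144));
	return 0;
}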
863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for max being lower than min due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
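Plugging in the defaults above makes the bounds concrete: with scan_size = 256MB, windows = MAX_SCAN_WINDOW / 256 = 10 and the floor is 1000 / 10 = 100ms. For the 1GB-RSS task from the earlier example (4 scan windows), task_scan_min() = max(100, 1000/4) = 250ms and task_scan_max() = max(250, 60000/4) = 15000ms, so the task rescans its address space somewhere between every 250ms and every 15 seconds.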
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
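For reference, this packs two counters per node, shared (priv == 0) before private (priv == 1): on a 2-node machine the array reads [node0 shared, node0 private, node1 shared, node1 private], and task_faults() below sums both counters of a node.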
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
834{ 972{
835 int seq; 973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
836 975
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */ 976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
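Illustrative numbers, assuming a node of four CPUs each at the default power SCHED_POWER_SCALE (1024): ns->power = 4096, ns->capacity = DIV_ROUND_CLOSEST(4096, 1024) = 4, and has_capacity holds while fewer than four tasks are runnable there. ns->load is likewise normalised to SCHED_POWER_SCALE units so that nodes of different compute capacity compare fairly.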
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
1049 * be improved if the source task was migrated to the target dst_cpu, taking
1050 * into account that it might be best if the task running on the dst_cpu
1051 * were exchanged with the source task.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more remote accesses would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if it cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
1159
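A quick example of the final load check: task_numa_migrate() below derives an imbalance_pct of roughly 112, so with src_load = 1000 the candidate is only rejected once dst_load exceeds 1120 after the task loads are applied; up to a 12% imbalance is tolerated in exchange for better NUMA placement.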
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks -- counter to the numa conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both task and group benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on a CPU of the preferred node */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
838 return; 1271 return;
1272
1273 /* Otherwise, try to migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1282 * scan period will decrease
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
1295{
1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If there were no recorded hinting faults then either the task is
1305 * completely idle or all activity is in areas that are not of interest
1306 * to automatic numa balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
1314
1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * Ratios are computed in NUMA_PERIOD_SLOTS units:
1321 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1322 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking the intent is that there is little point
1339 * scanning faster if shared accesses dominate as it may
1340 * simply bounce migrations uselessly
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
1351
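Worked example of the increase branch: with numa_scan_period = 1000ms, period_slot = DIV_ROUND_UP(1000, 10) = 100ms. If 8 of 10 recorded faults were local, ratio = (8 * 10) / 10 = 8, slot = 8 - 3 = 5, and diff = +500ms, so the period grows to 1500ms (subject to the task_scan_min()/task_scan_max() clamp): placement already looks good, so scan more slowly.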
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
1431
1432 /* Set the preferred node to the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1485
1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
843 1523
844 /* FIXME: Scheduling placement policy hints go here */ 1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
1629 * Retry task-to-preferred-node migration periodically, in case it
1630 * previously failed, or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
869 1641
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to the vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2071} 2846}
2072 2847
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2849{
2850 static_key_slow_inc(&__cfs_bandwidth_used);
2851}
2852
2853void cfs_bandwidth_usage_dec(void)
2074{ 2854{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2855 static_key_slow_dec(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled)
2077 static_key_slow_inc(&__cfs_bandwidth_used);
2078 else if (!enabled && was_enabled)
2079 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2856}
2081#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2860 return true;
2085} 2861}
2086 2862
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
2089 2866
2090/* 2867/*
@@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2339} 3118}
2340 3119
@@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3227 if (idle)
2449 goto out_unlock; 3228 goto out_unlock;
2450 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3238
2453 if (!throttled) { 3239 if (!throttled) {
@@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3296
2511/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3305{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3376 u64 expires;
2585 3377
2586 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2588 return; 3382 return;
3383 }
2589 3384
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2710 */ 3504 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3907{
3114 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3115 3909
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3911 return wl;
3118 3912
3119 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3960}
3167#else 3961#else
3168 3962
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3964{
3172 return wl; 3965 return wl;
3173} 3966}
@@ -3420,11 +4213,10 @@ done:
3420 * preempt must be disabled. 4213 * preempt must be disabled.
3421 */ 4214 */
3422static int 4215static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4217{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4220 int new_cpu = cpu;
3429 int want_affine = 0; 4221 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4696
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4698
4699enum fbq_type { regular, remote, all };
4700
3907#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3910 4705
3911struct lb_env { 4706struct lb_env {
3912 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3929,6 +4724,8 @@ struct lb_env {
3929 unsigned int loop; 4724 unsigned int loop;
3930 unsigned int loop_break; 4725 unsigned int loop_break;
3931 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3932}; 4729};
3933 4730
3934/* 4731/*
@@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3976} 4773}
3977 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight get worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
4846
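As an example of how these two weigh in, suppose a task's task_weight is 300 on the source node and 500 on the destination, with group_weight 400 versus 600: migrate_improves_locality() returns true (both improve), so the move is taken even if the task is cache hot; with the mirror-image numbers, migrate_degrades_locality() returns true and the task is treated as cache hot, resisting the move.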
3978/* 4847/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4849 */
@@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4866
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
4000 /* 4871 /*
4001 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4877 * one in current iteration.
4007 */ 4878 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4880 return 0;
4010 4881
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
4016 break; 4887 break;
4017 } 4888 }
@@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4901
4031 /* 4902 /*
4032 * Aggressive migration if: 4903 * Aggressive migration if:
4033 * 1) task is cache cold, or 4904 * 1) destination numa node is preferred
4034 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
4035 */ 4907 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
4038 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4924
@@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4961 return 0;
4078} 4962}
4079 4963
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4083 4965
4084/* 4966/*
@@ -4291,6 +5173,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5173 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
4294}; 5180};
4295 5181
4296/* 5182/*
@@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5216/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5220 *
4335 * Return: The load index. 5221 * Return: The load index.
4336 */ 5222 */
@@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5333{
4448 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5336 unsigned long power, power_orig;
4451 unsigned long interval; 5337 unsigned long interval;
4452 5338
4453 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5345 return;
4460 } 5346 }
4461 5347
4462 power = 0; 5348 power_orig = power = 0;
4463 5349
4464 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4465 /* 5351 /*
@@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5353 * span the current group.
4468 */ 5354 */
4469 5355
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4472 } else { 5362 } else {
4473 /* 5363 /*
4474 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5367
4478 group = child->groups; 5368 group = child->groups;
4479 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5371 power += group->sgp->power;
4481 group = group->next; 5372 group = group->next;
4482 } while (group != child->groups); 5373 } while (group != child->groups);
4483 } 5374 }
4484 5375
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4486} 5378}
4487 5379
4488/* 5380/*
@@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5418 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5419 *
4528 * The current solution to this issue is detecting the skew in the first group 5420 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5421 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5422 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5423 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5424 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5425 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5427 * to create an effective group imbalance.
4537 * 5428 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5429 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5431 * subtle and fragile situation.
4541 */ 5432 */
4542 5433
4543struct sg_imb_stats { 5434static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5435{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5436 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5437}
4553 5438
4554static inline void 5439/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5440 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limit power unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5447{
4558 if (load > sgi->max_cpu_load) 5448 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5449 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5450
4563 if (nr_running > sgi->max_nr_running) 5451 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5452 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5453 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5454
4569static inline int 5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5457 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5458
4585 return 0; 5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
4586} 5464}
4587 5465
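The "phantom core" case this avoids, using the historical SMT power figures as an assumption (smt_gain = 1178, so each of two siblings reports power_orig of about 589): a group of 8 threads on 4 cores has power_orig = 4712, and DIV_ROUND_CLOSEST(4712, 1024) alone would yield a capacity of 5, one core more than physically exists. Here smt = DIV_ROUND_UP(1024 * 8, 4712) = 2 and capacity = 8 / 2 = 4, matching the real core count.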
4588/** 5466/**
@@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4599{ 5477{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5478 unsigned long nr_running;
4602 unsigned long load; 5479 unsigned long load;
4603 int i; 5480 int i;
4604 5481
4605 init_sg_imb_stats(&sgi); 5482 memset(sgs, 0, sizeof(*sgs));
4606 5483
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4611 5488
4612 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5490 if (local_group)
4614 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4615 } else { 5492 else
4616 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5494
4620 sgs->group_load += load; 5495 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4625 } 5504 }
4626 5505
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5507 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5512
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4647 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4650} 5520}
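
The avg_load line above normalizes raw group load by group power, so groups of different compute capacity compare on an equal footing. A quick standalone check with invented figures:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024

int main(void)
{
	unsigned long group_load = 2048;			/* same raw load */
	unsigned long full_power = 2 * SCHED_POWER_SCALE;	/* two whole cpus */
	unsigned long half_power = 1 * SCHED_POWER_SCALE;	/* power halved */

	/* the weaker group reports twice the average load */
	printf("%lu\n", group_load * SCHED_POWER_SCALE / full_power);	/* 1024 */
	printf("%lu\n", group_load * SCHED_POWER_SCALE / half_power);	/* 2048 */
	return 0;
}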
@@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5563 return false;
4694} 5564}
4695 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
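
A small user-space rendering of the classification above, with invented task counts. The invariant nr_preferred_running <= nr_numa_running <= sum_nr_running is what lets two comparisons cover all three cases:

#include <stdio.h>

enum fbq_type { regular, remote, all };

struct counts { unsigned int total, numa, preferred; };

/* mirrors fbq_classify_group()/fbq_classify_rq() above */
static enum fbq_type classify(struct counts c)
{
	if (c.total > c.numa)
		return regular;	/* some tasks carry no NUMA placement */
	if (c.total > c.preferred)
		return remote;	/* all NUMA tasks, some on the wrong node */
	return all;		/* everything is already ideally placed */
}

int main(void)
{
	printf("%d\n", classify((struct counts){ 4, 2, 1 }));	/* 0: regular */
	printf("%d\n", classify((struct counts){ 3, 3, 1 }));	/* 1: remote */
	printf("%d\n", classify((struct counts){ 2, 2, 2 }));	/* 2: all */
	return 0;
}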
5595
4696/** 5596/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5600 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5602{
4705 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5618 if (local_group) {
4721 sds->local = sg; 5619 sds->local = sg;
4722 sgs = &sds->local_stat; 5620 sgs = &sds->local_stat;
5621
5622 if (env->idle != CPU_NEWLY_IDLE ||
5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
4723 } 5625 }
4724 5626
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5628
5629 if (local_group)
5630 goto next_group;
5631
4728 /* 5632 /*
4729 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5634 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
 4736 * when a large-weight task outweighs the tasks on the system). 5640
4737 */ 5641 */
4738 if (prefer_sibling && !local_group && 5642 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5643 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5645
4742 /* Now, start updating sd_lb_stats */ 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5647 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5648 sds->busiest_stat = *sgs;
4749 } 5649 }
4750 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4751 sg = sg->next; 5656 sg = sg->next;
4752 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5661}
4754 5662
4755/** 5663/**
@@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5961 int i;
5054 5962
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
5058 SCHED_POWER_SCALE); 5966
5059 unsigned long wl; 5967 rq = cpu_rq(i);
5968 rt = fbq_classify_rq(rq);
5969
5970 /*
 5971 * We classify groups/runqueues into three types:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
 5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
 5983 * If we cannot move enough load due to this classification,
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
5990 continue;
5060 5991
5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 5994 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 5995 capacity = fix_small_capacity(env->sd, group);
5063 5996
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
5066 5998
5067 /* 5999 /*
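
Reading the rt > env->fbq_type filter with the enum order regular(0) < remote(1) < all(2): env->fbq_type names the least-preferred class of runqueue this balance pass is still willing to touch. A minimal sketch of the effect (the scenario is invented):

#include <stdio.h>

enum fbq_type { regular, remote, all };

int main(void)
{
	/* the busiest group was classified 'remote' */
	enum fbq_type env_type = remote;
	enum fbq_type rq_type[] = { regular, remote, all };
	int i;

	for (i = 0; i < 3; i++)
		printf("rq class %d: %s\n", rq_type[i],
		       rq_type[i] > env_type ? "skipped" : "eligible");
	/* only the 'all' runqueue, holding ideally placed tasks, is spared */
	return 0;
}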
@@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6096 int *continue_balancing)
5165{ 6097{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6100 struct sched_group *group;
5168 struct rq *busiest; 6101 struct rq *busiest;
5169 unsigned long flags; 6102 unsigned long flags;
@@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6110 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5180 }; 6114 };
5181 6115
5182 /* 6116 /*
@@ -5268,17 +6202,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6203 * excess load moved.
5270 */ 6204 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
 6207 /* Prevent dst_cpu from being re-selected via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6209
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6213 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5278 6215
 5279 /* Prevent dst_cpu from being re-selected via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6216 /*
5283 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
@@ -5286,6 +6220,18 @@ more_balance:
5286 goto more_balance; 6220 goto more_balance;
5287 } 6221 }
5288 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
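
The sd_parent block above is the producer half of the group-imbalance signal that the new sg_imbalanced() consumes one level up (it simply returns group->sgp->imbalance, as the first hunk shows). A toy model of the round trip, with the structs pared down to the one field involved:

#include <stdio.h>

struct sched_group_power { int imbalance; };
struct sched_group { struct sched_group_power *sgp; };

/* producer: a child-level balance pass blocked by task affinity */
static void report(struct sched_group *parent_group, int some_pinned,
		   long remaining_imbalance)
{
	parent_group->sgp->imbalance =
		(some_pinned && remaining_imbalance > 0);
}

/* consumer: the parent-level stats pass, shaped like sg_imbalanced() */
static int is_imbalanced(struct sched_group *group)
{
	return group->sgp->imbalance;
}

int main(void)
{
	struct sched_group_power sgp = { 0 };
	struct sched_group sg = { &sgp };

	report(&sg, 1, 512);			/* pinned tasks left load behind */
	printf("%d\n", is_imbalanced(&sg));	/* 1 */
	report(&sg, 0, 0);			/* a later pass balanced fine */
	printf("%d\n", is_imbalanced(&sg));	/* 0 */
	return 0;
}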
@@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6339 struct sched_domain *sd;
5394 int pulled_task = 0; 6340 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5396 6343
5397 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5398 6345
@@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6357 unsigned long interval;
5411 int continue_balancing = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5412 6360
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6362 continue;
5415 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5417 /* If we've pulled tasks over stop searching: */ 6370 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6372 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5421 } 6380 }
5422 6381
5423 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6398 */
5440 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5441 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5442} 6404}
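
The avg_idle check above is the point of the new accounting: never spend longer balancing than the CPU is expected to stay idle. A user-space sketch with invented per-domain costs:

#include <stdio.h>

int main(void)
{
	unsigned long long avg_idle = 500000;	/* expected idle time, ns */
	unsigned long long max_lb_cost[] = { 60000, 120000, 400000 };
	unsigned long long curr_cost = 0;
	int i;

	for (i = 0; i < 3; i++) {
		if (avg_idle < curr_cost + max_lb_cost[i]) {
			printf("stop before domain %d\n", i);
			break;
		}
		curr_cost += max_lb_cost[i];	/* assume the worst-case cost */
	}
	/* domains 0 and 1 are balanced (180000 ns); domain 2 is skipped */
	return 0;
}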
5443 6405
5444/* 6406/*
@@ -5572,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5572static inline void set_cpu_sd_state_busy(void) 6534static inline void set_cpu_sd_state_busy(void)
5573{ 6535{
5574 struct sched_domain *sd; 6536 struct sched_domain *sd;
6537 int cpu = smp_processor_id();
5575 6538
5576 rcu_read_lock(); 6539 rcu_read_lock();
5577 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6540 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5578 6541
5579 if (!sd || !sd->nohz_idle) 6542 if (!sd || !sd->nohz_idle)
5580 goto unlock; 6543 goto unlock;
5581 sd->nohz_idle = 0; 6544 sd->nohz_idle = 0;
5582 6545
5583 for (; sd; sd = sd->parent) 6546 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5584 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5585unlock: 6547unlock:
5586 rcu_read_unlock(); 6548 rcu_read_unlock();
5587} 6549}
@@ -5589,16 +6551,16 @@ unlock:
5589void set_cpu_sd_state_idle(void) 6551void set_cpu_sd_state_idle(void)
5590{ 6552{
5591 struct sched_domain *sd; 6553 struct sched_domain *sd;
6554 int cpu = smp_processor_id();
5592 6555
5593 rcu_read_lock(); 6556 rcu_read_lock();
5594 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6557 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5595 6558
5596 if (!sd || sd->nohz_idle) 6559 if (!sd || sd->nohz_idle)
5597 goto unlock; 6560 goto unlock;
5598 sd->nohz_idle = 1; 6561 sd->nohz_idle = 1;
5599 6562
5600 for (; sd; sd = sd->parent) 6563 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5601 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5602unlock: 6564unlock:
5603 rcu_read_unlock(); 6565 rcu_read_unlock();
5604} 6566}
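
Both helpers now touch a single cached sd_busy level instead of walking every parent domain, and the nohz_idle flag keeps the inc/dec pair idempotent so nr_busy_cpus cannot drift. A toy model, with C11 atomics standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct dom { int nohz_idle; atomic_int nr_busy_cpus; };

static void set_busy(struct dom *d)
{
	if (!d->nohz_idle)
		return;			/* already counted as busy */
	d->nohz_idle = 0;
	atomic_fetch_add(&d->nr_busy_cpus, 1);
}

static void set_idle(struct dom *d)
{
	if (d->nohz_idle)
		return;			/* already counted as idle */
	d->nohz_idle = 1;
	atomic_fetch_sub(&d->nr_busy_cpus, 1);
}

int main(void)
{
	struct dom d = { .nohz_idle = 1 };	/* cpu starts idle */

	set_busy(&d);
	set_busy(&d);			/* duplicate call: harmless no-op */
	set_idle(&d);
	printf("%d\n", atomic_load(&d.nr_busy_cpus));	/* 0: no drift */
	return 0;
}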
@@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6626 int update_next_balance = 0;
5665 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5666 6629
5667 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5668 6631
5669 rcu_read_lock(); 6632 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6647 continue;
5673 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5674 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
@@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6677 /*
5692 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5695 */ 6681 */
@@ -5704,14 +6690,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6691 update_next_balance = 1;
5706 } 6692 }
5707 6693 }
6694 if (need_decay) {
5708 /* 6695 /*
5709 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6698 */
5713 if (!continue_balancing) 6699 rq->max_idle_balance_cost =
5714 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6701 }
5716 rcu_read_unlock(); 6702 rcu_read_unlock();
5717 6703
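Two details of the block above, checked with a standalone sketch: the 253/256 factor fades max_newidle_lb_cost by roughly 1.2% per second, so a cost halves in about a minute, and the rq-wide copy is floored at sysctl_sched_migration_cost (500000 ns by default) to keep rq->avg_idle comparisons sane. The figures below are invented:

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;	/* 1 ms measured cost */
	unsigned long long floor = 500000;	/* sysctl_sched_migration_cost */
	int sec;

	for (sec = 0; sec < 60; sec++)
		cost = cost * 253 / 256;	/* ~1.2%/s decay */
	printf("after 60s: %llu ns\n", cost);	/* ~493000 ns: about half */

	if (cost < floor)
		cost = floor;			/* the rq-wide floor */
	printf("rq cost:   %llu ns\n", cost);	/* 500000 */
	return 0;
}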
@@ -5781,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5781{ 6767{
5782 unsigned long now = jiffies; 6768 unsigned long now = jiffies;
5783 struct sched_domain *sd; 6769 struct sched_domain *sd;
6770 struct sched_group_power *sgp;
6771 int nr_busy;
5784 6772
5785 if (unlikely(idle_cpu(cpu))) 6773 if (unlikely(idle_cpu(cpu)))
5786 return 0; 6774 return 0;
@@ -5806,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5806 goto need_kick; 6794 goto need_kick;
5807 6795
5808 rcu_read_lock(); 6796 rcu_read_lock();
5809 for_each_domain(cpu, sd) { 6797 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5810 struct sched_group *sg = sd->groups;
5811 struct sched_group_power *sgp = sg->sgp;
5812 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5813 6798
5814 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6799 if (sd) {
5815 goto need_kick_unlock; 6800 sgp = sd->groups->sgp;
6801 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5816 6802
5817 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6803 if (nr_busy > 1)
5818 && (cpumask_first_and(nohz.idle_cpus_mask,
5819 sched_domain_span(sd)) < cpu))
5820 goto need_kick_unlock; 6804 goto need_kick_unlock;
5821
5822 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5823 break;
5824 } 6805 }
6806
6807 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6808
6809 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6810 sched_domain_span(sd)) < cpu))
6811 goto need_kick_unlock;
6812
5825 rcu_read_unlock(); 6813 rcu_read_unlock();
5826 return 0; 6814 return 0;
5827 6815
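The rewritten kick test replaces the domain walk with two per-cpu pointers (sd_busy, sd_asym) cached at domain-build time. Because the current CPU is itself counted in nr_busy_cpus, nr_busy > 1 means some other CPU sharing package resources is busy too. A condensed sketch of the two triggers, with the lookup scaffolding invented:

#include <stdio.h>

struct dom { int nr_busy_cpus; };	/* stand-in for sd->groups->sgp */

static int kick_needed(struct dom *busy, int first_idle_cpu, int cpu)
{
	/* sd_busy trigger: a package-resource sibling is busy as well */
	if (busy && busy->nr_busy_cpus > 1)
		return 1;

	/*
	 * sd_asym trigger: a lower-numbered cpu in the asym-packing
	 * domain is idle, so work should be packed down onto it.
	 */
	if (first_idle_cpu >= 0 && first_idle_cpu < cpu)
		return 1;

	return 0;
}

int main(void)
{
	struct dom d = { .nr_busy_cpus = 2 };	/* us plus one sibling */

	printf("%d\n", kick_needed(&d, -1, 3));		/* 1: busy sibling */
	printf("%d\n", kick_needed(NULL, 1, 3));	/* 1: pack onto cpu 1 */
	printf("%d\n", kick_needed(NULL, -1, 3));	/* 0: stay in nohz */
	return 0;
}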
@@ -6214,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7202 se->cfs_rq = parent->my_q;
6215 7203
6216 se->my_q = cfs_rq; 7204 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7205 /* guarantee group entities always have weight */
7206 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7207 se->parent = parent;
6219} 7208}
6220 7209
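
The final hunk gives group scheduling entities a nonzero initial weight. A toy slice computation shows why weight 0 is degenerate: an entity's share of the period scales with w/W, so zero weight means a zero slice and broken vruntime arithmetic. The period and task mix below are invented; the real math lives in sched_slice():

#include <stdio.h>

#define NICE_0_LOAD 1024

/* slice = period * w / W, the shape of the kernel's sched_slice() */
static unsigned long long slice(unsigned long long period,
				unsigned long long w,
				unsigned long long others)
{
	return period * w / (others + w);
}

int main(void)
{
	unsigned long long period = 6000000;		/* 6 ms */
	unsigned long long others = 3 * NICE_0_LOAD;	/* three nice-0 tasks */

	printf("w=0:    %llu ns\n", slice(period, 0, others));		 /* 0 */
	printf("w=1024: %llu ns\n", slice(period, NICE_0_LOAD, others)); /* 1500000 */
	return 0;
}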