path: root/kernel/sched_fair.c
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 761
1 file changed, 706 insertions(+), 55 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..5c9e67923b7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
 99 * we will only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
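
The sysctl added above sets the granularity at which quota is transferred from the global (per-task_group) pool to each per-cpu cfs_rq. As a rough user-space sketch of how quota, period and slice relate (hypothetical group configuration, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	/* hypothetical group configuration: 20ms quota per 100ms period */
	uint64_t quota_ns  = 20 * 1000 * 1000ULL;
	uint64_t period_ns = 100 * 1000 * 1000ULL;  /* default_cfs_period() */
	uint64_t slice_ns  = 5000 * NSEC_PER_USEC;  /* sysctl default: 5ms */

	/* each request by a cfs_rq pulls at most one slice from the pool */
	uint64_t max_grants = quota_ns / slice_ns;

	printf("quota=%llums period=%llums slice=%llums -> at most %llu full slices per period\n",
	       (unsigned long long)(quota_ns / 1000000),
	       (unsigned long long)(period_ns / 1000000),
	       (unsigned long long)(slice_ns / 1000000),
	       (unsigned long long)max_grants);
	return 0;
}
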
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
1327 * Refresh the global state and ensure bandwidth timer becomes
1328 * active.
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
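
A user-space sketch of the arithmetic in assign_cfs_rq_runtime() above: a cfs_rq that has overrun its local pool (runtime_remaining <= 0) asks for one slice plus its debt, clamped to what the global pool still holds. The pool/local structs are hypothetical stand-ins, not the kernel's cfs_bandwidth/cfs_rq, and locking is ignored.

#include <stdio.h>
#include <stdint.h>

struct pool  { int64_t runtime; };            /* global (tg) pool, ns  */
struct local { int64_t runtime_remaining; };  /* per-cfs_rq pool, ns   */

static int assign_runtime(struct pool *g, struct local *l, int64_t slice)
{
	/* positive sum, since runtime_remaining <= 0 at this point */
	int64_t min_amount = slice - l->runtime_remaining;
	int64_t amount = 0;

	if (g->runtime > 0) {
		amount = g->runtime < min_amount ? g->runtime : min_amount;
		g->runtime -= amount;
	}
	l->runtime_remaining += amount;

	return l->runtime_remaining > 0;  /* 0 => caller should throttle */
}

int main(void)
{
	struct pool  g = { .runtime = 3000000 };            /* 3ms left globally */
	struct local l = { .runtime_remaining = -500000 };  /* 0.5ms in debt     */

	int ok = assign_runtime(&g, &l, 5000000 /* 5ms slice */);
	printf("local=%lldns global=%lldns runnable=%d\n",
	       (long long)l.runtime_remaining, (long long)g.runtime, ok);
	return 0;
}
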
1355
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
 1377	 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
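
The expiry test above relies on signed subtraction of clock values, so it stays correct even if the u64 clocks drift apart or wrap. A sketch of that comparison in isolation (assumed two's-complement behaviour, as the kernel itself assumes; not the kernel helpers):

#include <stdio.h>
#include <stdint.h>

/* "a is before b" in the presence of wraparound, as in the expiry checks */
static int clock_before(uint64_t a, uint64_t b)
{
	/* interpreting the unsigned difference as signed gives the ordering */
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t expires = 1000;           /* local deadline           */
	uint64_t clock_early = 900;        /* still inside the period  */
	uint64_t clock_wrapped = UINT64_MAX;

	printf("900 before 1000: %d\n", clock_before(clock_early, expires));
	/* a huge unsigned value still compares as "before" a small one */
	printf("UINT64_MAX before 1000: %d\n", clock_before(clock_wrapped, expires));
	return 0;
}
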
1389
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
1561 /* determine whether we need to wake up potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
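
distribute_cfs_runtime() pays each throttled cfs_rq just enough to bring its local pool to +1ns and stops once the refreshed runtime is exhausted. A user-space sketch of that loop over plain arrays (hypothetical deficits, ignoring the rq locks and RCU):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* deficits of three hypothetical throttled cfs_rqs, in ns (<= 0) */
	int64_t local[3] = { -2000000, -4000000, -1000000 };
	int64_t remaining = 5000000;  /* runtime refreshed this period */

	for (int i = 0; i < 3 && remaining > 0; i++) {
		/* just enough to reach +1ns, as in the kernel loop */
		int64_t need = -local[i] + 1;
		int64_t give = need < remaining ? need : remaining;

		remaining -= give;
		local[i] += give;

		printf("cfs_rq %d: now %lldns, %s\n", i, (long long)local[i],
		       local[i] > 0 ? "unthrottled" : "still throttled");
	}
	printf("undistributed runtime: %lldns\n", (long long)remaining);
	return 0;
}
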
1603
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641 * There are throttled entities so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668 * While we are ensured activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
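
The return value of the handler above drives the hrtimer: a period with no bandwidth consumption and no throttled groups reports idle, so the timer is stopped until the next enqueue restarts it. A compressed model of that decision (simplified simulation, not the kernel control flow):

#include <stdio.h>

/* returns 1 if the period timer should go idle (simplified model) */
static int period_tick(int quota_is_inf, int had_activity, int nr_throttled)
{
	if (quota_is_inf)
		return 1;  /* no constraint: stop the timer                  */
	if (!had_activity && !nr_throttled)
		return 1;  /* quiet period: stop, restarted on next enqueue  */
	return 0;          /* keep refreshing quota each period              */
}

int main(void)
{
	printf("%d %d %d\n",
	       period_tick(1, 0, 0),   /* RUNTIME_INF            -> idle */
	       period_tick(0, 1, 0),   /* bandwidth was consumed -> keep */
	       period_tick(0, 0, 2));  /* throttled groups exist -> keep */
	return 0;
}
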
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695 /* if the call-back is running a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
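
On the final dequeue a cfs_rq keeps only min_cfs_rq_runtime (1ms) locally and returns the rest to the global pool, where the slack timer may redistribute it. A sketch of that slack calculation with hypothetical values:

#include <stdio.h>
#include <stdint.h>

#define MIN_CFS_RQ_RUNTIME 1000000LL  /* 1ms kept locally, as in the patch */

int main(void)
{
	int64_t runtime_remaining = 3500000;  /* 3.5ms of unused local runtime */
	int64_t slack = runtime_remaining - MIN_CFS_RQ_RUNTIME;

	if (slack > 0) {
		runtime_remaining -= slack;  /* handed back to the global pool */
		printf("returned %lldns, kept %lldns\n",
		       (long long)slack, (long long)runtime_remaining);
	} else {
		printf("nothing worth returning\n");
	}
	return 0;
}
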
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1787 * runtime as update_curr() throttling cannot trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
 1813	 * state (e.g. set_curr_task); in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
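
Both enqueue and dequeue now stop walking up the hierarchy at the first throttled level: that cfs_rq still gets its h_nr_running adjusted, but its ancestors already account the whole group as blocked and are left untouched. A toy sketch of that walk (hypothetical types, enqueue direction only):

#include <stdio.h>

struct toy_cfs_rq {
	int h_nr_running;
	int throttled;
	struct toy_cfs_rq *parent;
};

/* propagate one new runnable task upward, stopping at a throttled level */
static void toy_enqueue(struct toy_cfs_rq *cfs_rq)
{
	for (; cfs_rq; cfs_rq = cfs_rq->parent) {
		cfs_rq->h_nr_running++;
		if (cfs_rq->throttled)
			break;  /* ancestors already see the group as blocked */
	}
}

int main(void)
{
	struct toy_cfs_rq root = { 0, 0, NULL };
	struct toy_cfs_rq mid  = { 0, 1, &root };  /* throttled ancestor */
	struct toy_cfs_rq leaf = { 0, 0, &mid };

	toy_enqueue(&leaf);
	printf("leaf=%d mid=%d root=%d\n",
	       leaf.h_nr_running, mid.h_nr_running, root.h_nr_running);
	return 0;
}
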
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2183
1548 /* Skip over this group if it has no CPUs allowed */ 2184 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2185 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2186 tsk_cpus_allowed(p)))
1551 continue; 2187 continue;
1552 2188
1553 local_group = cpumask_test_cpu(this_cpu, 2189 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2229 int i;
1594 2230
1595 /* Traverse only the allowed CPUs */ 2231 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2233 load = weighted_cpuload(i);
1598 2234
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2235 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break; 2274 break;
1639 2275
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
1641 if (idle_cpu(i)) { 2277 if (idle_cpu(i)) {
1642 target = i; 2278 target = i;
1643 break; 2279 break;
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2316 int sync = wake_flags & WF_SYNC;
1681 2317
1682 if (sd_flag & SD_BALANCE_WAKE) { 2318 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2319 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2320 want_affine = 1;
1685 new_cpu = prev_cpu; 2321 new_cpu = prev_cpu;
1686 } 2322 }
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
 2516	 * unconditionally check_preempt_curr() after an enqueue (which may have
 2517	 * led to a throttle).  This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2697 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2698 * 3) are cache-hot on their current CPU.
2051 */ 2699 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2700 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2701 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2702 return 0;
2055 } 2703 }
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3430,7 +4087,7 @@ redo:
3430 * moved to this_cpu 4087 * moved to this_cpu
3431 */ 4088 */
3432 if (!cpumask_test_cpu(this_cpu, 4089 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4090 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4091 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4092 flags);
3436 all_pinned = 1; 4093 all_pinned = 1;
@@ -3612,22 +4269,6 @@ out_unlock:
3612} 4269}
3613 4270
3614#ifdef CONFIG_NO_HZ 4271#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4272/*
3632 * idle load balancing details 4273 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4274 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4308 struct sched_domain *sd;
3668 4309
3669 for_each_domain(cpu, sd) 4310 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4311 if (sd->flags & flag)
3671 break; 4312 break;
3672 4313
3673 return sd; 4314 return sd;
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4434 }
3794 4435
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4436 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4437 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4438
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4439 smp_mb();
4440 /*
4441 * Use smp_send_reschedule() instead of resched_cpu().
4442 * This way we generate a sched IPI on the target cpu which
4443 * is idle. And the softirq performing nohz idle load balance
4444 * will be run before returning from the IPI.
4445 */
4446 smp_send_reschedule(ilb_cpu);
3801 } 4447 }
3802 return; 4448 return;
3803} 4449}
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4676 if (time_before(now, nohz.next_balance))
4031 return 0; 4677 return 0;
4032 4678
4033 if (rq->idle_at_tick) 4679 if (idle_cpu(cpu))
4034 return 0; 4680 return 0;
4035 4681
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4682 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4712{
4067 int this_cpu = smp_processor_id(); 4713 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4714 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4715 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4716 CPU_IDLE : CPU_NOT_IDLE;
4071 4717
4072 rebalance_domains(this_cpu, idle); 4718 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4897{
4252 struct sched_entity *se = &rq->curr->se; 4898 struct sched_entity *se = &rq->curr->se;
4253 4899
4254 for_each_sched_entity(se) 4900 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4901 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4902
4903 set_next_entity(cfs_rq, se);
4904 /* ensure bandwidth has been allocated on our new cfs_rq */
4905 account_cfs_rq_runtime(cfs_rq, 0);
4906 }
4256} 4907}
4257 4908
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4909#ifdef CONFIG_FAIR_GROUP_SCHED