author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2018-10-05 18:39:38 -0400
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2018-10-05 18:39:38 -0400
commit	8be673735e5144e13fe739fba5a0a33fc50f3a16 (patch)
tree	66612de0994ff7ff115a5071903f2a1301f6d4b1 /kernel
parent	1df377db3d0131057fa33b4dcda05c3e341308ab (diff)
parent	37355bdc5a129899f6b245900a8eb944a092f7fd (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Ingo writes:
 "scheduler fixes:

  These fixes address a rather involved performance regression between
  v4.17->v4.19 in the sched/numa auto-balancing code. Since distros really
  need this fix we accelerated it to sched/urgent for a faster upstream
  merge.

  NUMA scheduling and balancing performance is now largely back to v4.17
  levels, without reintroducing the NUMA placement bugs that v4.18 and
  v4.19 fixed.

  Many thanks to Srikar Dronamraju, Mel Gorman and Jirka Hladky, for
  reporting, testing, re-testing and solving this rather complex set of
  bugs."

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
  mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration
  sched/numa: Avoid task migration for small NUMA improvement
  mm/migrate: Use spin_trylock() while resetting rate limit
  sched/numa: Limit the conditions where scan period is reset
  sched/numa: Reset scan rate whenever task moves across nodes
  sched/numa: Pass destination CPU as a parameter to migrate_task_rq
  sched/numa: Stop multiple tasks from moving to the CPU at the same time
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/core.c	2
-rw-r--r--	kernel/sched/deadline.c	2
-rw-r--r--	kernel/sched/fair.c	104
-rw-r--r--	kernel/sched/sched.h	3
4 files changed, 95 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 625bc9897f62..ad97f3ba5ec5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p);
+			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		rseq_migrate(p);
 		perf_event_task_migrate(p);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 997ea7b839fa..91e4202b0634 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1607,7 +1607,7 @@ out:
 	return cpu;
 }
 
-static void migrate_task_rq_dl(struct task_struct *p)
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
 {
 	struct rq *rq;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f808ddf2a868..7fc4a371bdd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	int last_cpupid, this_cpupid;
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+	/*
+	 * Allow first faults or private faults to migrate immediately early in
+	 * the lifetime of a task. The magic number 4 is based on waiting for
+	 * two full passes of the "multi-stage node selection" test that is
+	 * executed below.
+	 */
+	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+		return true;
 
 	/*
 	 * Multi-stage node selection is used in conjunction with a periodic
@@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	 * This quadric squishes small probabilities, making it less likely we
 	 * act on an unlikely task<->page relation.
 	 */
-	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 	if (!cpupid_pid_unset(last_cpupid) &&
 				cpupid_to_nid(last_cpupid) != dst_nid)
 		return false;
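
For readers following the logic rather than the diff hunks, here is a minimal userspace sketch of the early-migration rule added above. The `struct task`, `cpupid_unset()` and `cpupid_matches()` names are simplified stand-ins, not the kernel's APIs; only the shape of the condition mirrors the patch.

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified stand-ins for the kernel's task and cpupid state. */
	struct task {
		int numa_preferred_nid;	/* -1 means no preferred node yet */
		int numa_scan_seq;	/* completed NUMA scan passes */
		int pid;
	};

	static bool cpupid_unset(int last_pid) { return last_pid < 0; }
	static bool cpupid_matches(const struct task *p, int last_pid) { return p->pid == last_pid; }

	/*
	 * Mirror of the early-migration check: first faults or private faults
	 * are allowed to migrate immediately while the task is still young
	 * (no preferred node yet, or fewer than ~two full scan passes, hence
	 * the "magic number 4" in the comment above).
	 */
	static bool migrate_early(const struct task *p, int last_pid)
	{
		return (p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
		       (cpupid_unset(last_pid) || cpupid_matches(p, last_pid));
	}

	int main(void)
	{
		struct task young = { .numa_preferred_nid = -1, .numa_scan_seq = 1, .pid = 42 };
		struct task old   = { .numa_preferred_nid =  0, .numa_scan_seq = 9, .pid = 42 };

		printf("young task, first fault:  %d\n", migrate_early(&young, -1)); /* 1 */
		printf("old task, foreign fault:  %d\n", migrate_early(&old, 17));   /* 0 */
		return 0;
	}
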
@@ -1514,6 +1524,21 @@ struct task_numa_env {
 static void task_numa_assign(struct task_numa_env *env,
 			     struct task_struct *p, long imp)
 {
+	struct rq *rq = cpu_rq(env->dst_cpu);
+
+	/* Bail out if run-queue part of active NUMA balance. */
+	if (xchg(&rq->numa_migrate_on, 1))
+		return;
+
+	/*
+	 * Clear previous best_cpu/rq numa-migrate flag, since task now
+	 * found a better CPU to move/swap.
+	 */
+	if (env->best_cpu != -1) {
+		rq = cpu_rq(env->best_cpu);
+		WRITE_ONCE(rq->numa_migrate_on, 0);
+	}
+
 	if (env->best_task)
 		put_task_struct(env->best_task);
 	if (p)
@@ -1553,6 +1578,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 }
 
 /*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP	30
+
+/*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source tasks was migrated to the target dst_cpu taking
  * into account that it might be best if task running on the dst_cpu should
@@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env,
 	long moveimp = imp;
 	int dist = env->dist;
 
+	if (READ_ONCE(dst_rq->numa_migrate_on))
+		return;
+
 	rcu_read_lock();
 	cur = task_rcu_dereference(&dst_rq->curr);
 	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		goto unlock;
 
 	if (!cur) {
-		if (maymove || imp > env->best_imp)
+		if (maymove && moveimp >= env->best_imp)
 			goto assign;
 		else
 			goto unlock;
@@ -1625,16 +1660,22 @@ static void task_numa_compare(struct task_numa_env *env,
 			      task_weight(cur, env->dst_nid, dist);
 	}
 
-	if (imp <= env->best_imp)
-		goto unlock;
-
 	if (maymove && moveimp > imp && moveimp > env->best_imp) {
-		imp = moveimp - 1;
+		imp = moveimp;
 		cur = NULL;
 		goto assign;
 	}
 
 	/*
+	 * If the NUMA importance is less than SMALLIMP,
+	 * task migration might only result in ping pong
+	 * of tasks and also hurt performance due to cache
+	 * misses.
+	 */
+	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+		goto unlock;
+
+	/*
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 	load = task_h_load(env->p) - task_h_load(cur);
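
A note on the arithmetic in the SMALLIMP comment: the maximum NUMA importance is 1998 (2*999), and 1998/64 is roughly 31, so SMALLIMP = 30 sits just below that. The standalone sketch below shows the resulting filter with simplified names; it is illustrative only, not kernel code.

	#include <stdbool.h>
	#include <stdio.h>

	#define SMALLIMP 30	/* ~1998/64, where 1998 = 2 * 999 is the maximum importance */

	/*
	 * Mirror of the "too small to bother" filter: skip the move/swap when
	 * the absolute improvement is tiny, or when it does not beat the
	 * current best candidate by at least SMALLIMP/2, to avoid ping-pong.
	 */
	static bool improvement_too_small(long imp, long best_imp)
	{
		return imp < SMALLIMP || imp <= best_imp + SMALLIMP / 2;
	}

	int main(void)
	{
		printf("%d\n", improvement_too_small(25, 0));   /* 1: below SMALLIMP */
		printf("%d\n", improvement_too_small(100, 90)); /* 1: only +10 over best */
		printf("%d\n", improvement_too_small(100, 40)); /* 0: worth considering */
		return 0;
	}
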
@@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
+	struct rq *best_rq;
 	unsigned long taskweight, groupweight;
 	int nid, ret, dist;
 	long taskimp, groupimp;
@@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p)
 	if (env.best_cpu == -1)
 		return -EAGAIN;
 
-	/*
-	 * Reset the scan period if the task is being rescheduled on an
-	 * alternative node to recheck if the tasks is now properly placed.
-	 */
-	p->numa_scan_period = task_scan_start(p);
-
+	best_rq = cpu_rq(env.best_cpu);
 	if (env.best_task == NULL) {
 		ret = migrate_task_to(p, env.best_cpu);
+		WRITE_ONCE(best_rq->numa_migrate_on, 0);
 		if (ret != 0)
 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 		return ret;
 	}
 
 	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+	WRITE_ONCE(best_rq->numa_migrate_on, 0);
 
 	if (ret != 0)
 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
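
The numa_migrate_on handshake above (xchg() to claim the destination run-queue in task_numa_assign(), WRITE_ONCE() to release it once the migration or swap has been issued) behaves like a per-runqueue try-lock. Here is a rough userspace sketch using C11 atomics as a stand-in for the kernel primitives; the struct and function names are illustrative assumptions, not kernel code.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative stand-in for the per-runqueue flag added in this series. */
	struct fake_rq {
		atomic_int numa_migrate_on;
	};

	/* Claim the destination rq, as task_numa_assign() does with xchg(). */
	static bool claim_rq(struct fake_rq *rq)
	{
		/* Returns false if another balancer already holds the flag. */
		return atomic_exchange(&rq->numa_migrate_on, 1) == 0;
	}

	/* Release the rq, as task_numa_migrate() does with WRITE_ONCE(). */
	static void release_rq(struct fake_rq *rq)
	{
		atomic_store(&rq->numa_migrate_on, 0);
	}

	int main(void)
	{
		struct fake_rq rq = { .numa_migrate_on = 0 };

		printf("first claim:   %d\n", claim_rq(&rq)); /* 1: got it */
		printf("second claim:  %d\n", claim_rq(&rq)); /* 0: already busy */
		release_rq(&rq);
		printf("after release: %d\n", claim_rq(&rq)); /* 1: available again */
		return 0;
	}
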
@@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+	int src_nid = cpu_to_node(task_cpu(p));
+	int dst_nid = cpu_to_node(new_cpu);
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		return;
+
+	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+		return;
+
+	if (src_nid == dst_nid)
+		return;
+
+	/*
+	 * Allow resets if faults have been trapped before one scan
+	 * has completed. This is most likely due to a new task that
+	 * is pulled cross-node due to wakeups or load balancing.
+	 */
+	if (p->numa_scan_seq) {
+		/*
+		 * Avoid scan adjustments if moving to the preferred
+		 * node or if the task was not previously running on
+		 * the preferred node.
+		 */
+		if (dst_nid == p->numa_preferred_nid ||
+		    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+			return;
+	}
+
+	p->numa_scan_period = task_scan_start(p);
+}
+
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
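
To make the reset conditions in update_scan_period() easier to trace, the sketch below reduces the decision to a plain predicate over node IDs, with a few worked cases. It uses simplified stand-in fields and omits the sched_numa_balancing, p->mm, numa_faults and PF_EXITING guards shown in the hunk above.

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified view of the fields update_scan_period() looks at. */
	struct numa_state {
		int scan_seq;      /* completed NUMA scan passes */
		int preferred_nid; /* -1 if no preferred node yet */
	};

	/* Mirror of the "should we reset the scan period?" decision. */
	static bool reset_scan_period(const struct numa_state *s, int src_nid, int dst_nid)
	{
		if (src_nid == dst_nid)
			return false;
		if (s->scan_seq) {
			/*
			 * Skip resets when moving to the preferred node, or when
			 * the task was not on its preferred node to begin with.
			 */
			if (dst_nid == s->preferred_nid ||
			    (s->preferred_nid != -1 && src_nid != s->preferred_nid))
				return false;
		}
		return true;
	}

	int main(void)
	{
		struct numa_state new_task = { .scan_seq = 0, .preferred_nid = -1 };
		struct numa_state settled  = { .scan_seq = 5, .preferred_nid = 0 };

		printf("%d\n", reset_scan_period(&new_task, 0, 1)); /* 1: first scan not done */
		printf("%d\n", reset_scan_period(&settled, 1, 0));  /* 0: moving to preferred */
		printf("%d\n", reset_scan_period(&settled, 0, 1));  /* 1: pulled off preferred */
		return 0;
	}
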
@@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -6275,7 +6351,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
  */
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
 	/*
 	 * As blocked tasks retain absolute vruntime the migration needs to
@@ -6328,6 +6404,8 @@ static void migrate_task_rq_fair(struct task_struct *p)
 
 	/* We have migrated, no longer consider this task hot */
 	p->se.exec_start = 0;
+
+	update_scan_period(p, new_cpu);
 }
 
 static void task_dead_fair(struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a2e8cae63c4..455fa330de04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -783,6 +783,7 @@ struct rq {
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
 	unsigned int		nr_preferred_running;
+	unsigned int		numa_migrate_on;
 #endif
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long		cpu_load[CPU_LOAD_IDX_MAX];
@@ -1523,7 +1524,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p);
+	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 