author     Mel Gorman <mgorman@techsingularity.net>    2018-02-13 08:37:30 -0500
committer  Ingo Molnar <mingo@kernel.org>              2018-02-21 02:49:45 -0500
commit     7347fc87dfe6b7315e74310ee1243dc222c68086 (patch)
tree       c2cdf5171e956329b953ae253fe64a66e245c6db
parent     2c83362734dad8e48ccc0710b5cd2436a0323893 (diff)
sched/numa: Delay retrying placement for automatic NUMA balance after wake_affine()
If wake_affine() pulls a task to another node for any reason and the node is
no longer preferred then temporarily stop automatic NUMA balancing pulling
the task back. Otherwise, tasks with a strong waker/wakee relationship may
constantly fight automatic NUMA balancing over where a task should be placed.

Once again, netperf is interesting here. The performance barely changes but
automatic NUMA balancing is interesting:

 Hmean     send-64         354.67 (   0.00%)      352.15 (  -0.71%)
 Hmean     send-128        702.91 (   0.00%)      693.84 (  -1.29%)
 Hmean     send-256       1350.07 (   0.00%)     1344.19 (  -0.44%)
 Hmean     send-1024      5124.38 (   0.00%)     4941.24 (  -3.57%)
 Hmean     send-2048      9687.44 (   0.00%)     9624.45 (  -0.65%)
 Hmean     send-3312     14577.64 (   0.00%)    14514.35 (  -0.43%)
 Hmean     send-4096     16393.62 (   0.00%)    16488.30 (   0.58%)
 Hmean     send-8192     26877.26 (   0.00%)    26431.63 (  -1.66%)
 Hmean     send-16384    38683.43 (   0.00%)    38264.91 (  -1.08%)
 Hmean     recv-64         354.67 (   0.00%)      352.15 (  -0.71%)
 Hmean     recv-128        702.91 (   0.00%)      693.84 (  -1.29%)
 Hmean     recv-256       1350.07 (   0.00%)     1344.19 (  -0.44%)
 Hmean     recv-1024      5124.38 (   0.00%)     4941.24 (  -3.57%)
 Hmean     recv-2048      9687.43 (   0.00%)     9624.45 (  -0.65%)
 Hmean     recv-3312     14577.59 (   0.00%)    14514.35 (  -0.43%)
 Hmean     recv-4096     16393.55 (   0.00%)    16488.20 (   0.58%)
 Hmean     recv-8192     26876.96 (   0.00%)    26431.29 (  -1.66%)
 Hmean     recv-16384    38682.41 (   0.00%)    38263.94 (  -1.08%)

 NUMA alloc hit                 1465986         1423090
 NUMA alloc miss                      0               0
 NUMA interleave hit                  0               0
 NUMA alloc local               1465897         1423003
 NUMA base PTE updates             1473            1420
 NUMA huge PMD updates                0               0
 NUMA page range updates           1473            1420
 NUMA hint faults                  1383            1312
 NUMA hint local faults             451             124
 NUMA hint local percent             32               9

There is a slight degradation in performance but there are slightly fewer
NUMA faults. There is a large drop in the percentage of local faults, but
the bulk of migrations for netperf are in small shared libraries, so this
reflects the fact that automatic NUMA balancing has backed off. This is a
case where, despite wake_affine() and automatic NUMA balancing fighting
over placement, there is a marginal benefit to rescheduling close to local
data quickly. However, it should be noted that wake_affine() and automatic
NUMA balancing constantly fighting each other is undesirable.

However, the benefit in other cases is large. This is the result for NAS
with the D class sizing on a 4-socket machine:

 nas-mpi
                           4.15.0                 4.15.0
                     sdnuma-v1r23       delayretry-v1r23
 Time cg.D      557.00 (   0.00%)      431.82 (  22.47%)
 Time ep.D       77.83 (   0.00%)       79.01 (  -1.52%)
 Time is.D       26.46 (   0.00%)       26.64 (  -0.68%)
 Time lu.D      727.14 (   0.00%)      597.94 (  17.77%)
 Time mg.D      191.35 (   0.00%)      146.85 (  23.26%)

                   4.15.0           4.15.0
             sdnuma-v1r23 delayretry-v1r23
 User            75665.20         70413.30
 System          20321.59          8861.67
 Elapsed           766.13           634.92

 Minor Faults               16528502         7127941
 Major Faults                   4553             5068
 NUMA alloc local            6963197          6749135
 NUMA base PTE updates     366409093        107491434
 NUMA huge PMD updates        687556           198880
 NUMA page range updates   718437765        209317994
 NUMA hint faults           13643410          4601187
 NUMA hint local faults      9212593          3063996
 NUMA hint local percent          67               66

Note the massive reduction in system CPU usage even though the percentage
of local faults is barely affected. There is a massive reduction in the
number of PTE updates, showing that automatic NUMA balancing has backed
off. A critical observation is also that there is a massive reduction in
minor faults, which is due to far fewer NUMA hinting faults being trapped.

There were questions on NAS OMP and how it behaved related to threads
being bound to CPUs. First, there are more gains than losses with this
patch applied and a reduction in system CPU usage:

 nas-omp
                       4.16.0-rc1             4.16.0-rc1
                      sdnuma-v2r1        delayretry-v2r1
 Time bt.D      436.71 (   0.00%)      430.05 (   1.53%)
 Time cg.D      201.02 (   0.00%)      180.87 (  10.02%)
 Time ep.D       32.84 (   0.00%)       32.68 (   0.49%)
 Time is.D        9.63 (   0.00%)        9.64 (  -0.10%)
 Time lu.D      331.20 (   0.00%)      304.80 (   7.97%)
 Time mg.D       54.87 (   0.00%)       52.72 (   3.92%)
 Time sp.D     1108.78 (   0.00%)      917.10 (  17.29%)
 Time ua.D      378.81 (   0.00%)      398.83 (  -5.28%)

                4.16.0-rc1      4.16.0-rc1
               sdnuma-v2r1 delayretry-v2r1
 User            305633.08       296751.91
 System             451.75          357.80
 Elapsed           2595.73         2368.13

However, it does not close the gap between binding and being unbound. There
is negligible difference between the performance of the baseline and a
patched kernel when threads are bound, so it is not presented here:

                       4.16.0-rc1             4.16.0-rc1
                  delayretry-bind     delayretry-unbound
 Time bt.D      385.02 (   0.00%)      430.05 ( -11.70%)
 Time cg.D      144.02 (   0.00%)      180.87 ( -25.59%)
 Time ep.D       32.85 (   0.00%)       32.68 (   0.52%)
 Time is.D       10.52 (   0.00%)        9.64 (   8.37%)
 Time lu.D      285.31 (   0.00%)      304.80 (  -6.83%)
 Time mg.D       43.21 (   0.00%)       52.72 ( -22.01%)
 Time sp.D      820.24 (   0.00%)      917.10 ( -11.81%)
 Time ua.D      337.09 (   0.00%)      398.83 ( -18.32%)

                4.16.0-rc1         4.16.0-rc1
           delayretry-bind delayretry-unbound
 User            277731.25          296751.91
 System              261.29            357.80
 Elapsed            2100.55           2368.13

Unfortunately, while performance is improved by the patch, there is still
quite a long way to go before it's equivalent to hard binding.

Other workloads like hackbench, tbench, dbench and schbench are barely
affected. dbench shows a mix of gains and losses depending on the machine,
although in general the results are more stable.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-7-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
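What follows is the diff itself. Before it, here is a minimal stand-alone sketch (not kernel code) of the backoff arithmetic described above: after wake_affine() moves a task to a different node, both waker and wakee have their NUMA-placement retry deadline pushed out to now + max(scan_delay, 4 * numa_scan_period). The struct, helper name, main() driver and the 1000ms scan-delay default used below are illustrative assumptions only; the real change is the update_wa_numa_placement() hunk in the diff.

/* backoff_sketch.c - illustrative only; mirrors the interval arithmetic,
 * not the kernel's data structures or jiffies handling. */
#include <stdio.h>

struct task_sketch {
	unsigned long numa_scan_period;		/* scan period in ms */
	unsigned long numa_migrate_retry;	/* absolute retry deadline in ms */
};

static unsigned long scan_delay = 1000;		/* assumed sysctl default, ms */

/* Deadline after an affine wakeup that crossed node boundaries. */
static unsigned long backoff_deadline(const struct task_sketch *t,
				      unsigned long now_ms)
{
	unsigned long interval = t->numa_scan_period << 2;	/* 4 * period */

	if (interval < scan_delay)
		interval = scan_delay;			/* never below scan_delay */
	return now_ms + interval;
}

int main(void)
{
	struct task_sketch wakee = { .numa_scan_period = 1000 };
	struct task_sketch waker = { .numa_scan_period = 100 };
	unsigned long now_ms = 0;

	/* Wakeup crossed nodes: defer placement retries for both tasks. */
	wakee.numa_migrate_retry = backoff_deadline(&wakee, now_ms);
	waker.numa_migrate_retry = backoff_deadline(&waker, now_ms);

	/* Prints 4000ms for the wakee, 1000ms (floored at scan_delay) for the waker. */
	printf("wakee retry at %lums, waker retry at %lums\n",
	       wakee.numa_migrate_retry, waker.numa_migrate_retry);
	return 0;
}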
-rw-r--r--   kernel/sched/fair.c   57
1 file changed, 56 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94aea5b91a96..33662a3bdc6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1869,6 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
         unsigned long interval = HZ;
+        unsigned long numa_migrate_retry;
 
         /* This task has no NUMA fault statistics yet */
         if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1877,18 @@ static void numa_migrate_preferred(struct task_struct *p)
 
         /* Periodically retry migrating the task to the preferred node */
         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-        p->numa_migrate_retry = jiffies + interval;
+        numa_migrate_retry = jiffies + interval;
+
+        /*
+         * Check that the new retry threshold is after the current one. If
+         * the retry is in the future, it implies that wake_affine has
+         * temporarily asked NUMA balancing to backoff from placement.
+         */
+        if (numa_migrate_retry > p->numa_migrate_retry)
+                return;
+
+        /* Safe to try placing the task on the preferred node */
+        p->numa_migrate_retry = numa_migrate_retry;
 
         /* Success if task is already running on preferred CPU */
         if (task_node(p) == p->numa_preferred_nid)
@@ -5759,6 +5771,48 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
         return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+        unsigned long interval;
+
+        if (!static_branch_likely(&sched_numa_balancing))
+                return;
+
+        /* If balancing has no preference then continue gathering data */
+        if (p->numa_preferred_nid == -1)
+                return;
+
+        /*
+         * If the wakeup is not affecting locality then it is neutral from
+         * the perspective of NUMA balancing so continue gathering data.
+         */
+        if (cpu_to_node(prev_cpu) == cpu_to_node(target))
+                return;
+
+        /*
+         * Temporarily prevent NUMA balancing trying to place waker/wakee after
+         * wakee has been moved by wake_affine. This will potentially allow
+         * related tasks to converge and update their data placement. The
+         * 4 * numa_scan_period is to allow the two-pass filter to migrate
+         * hot data to the waker's node.
+         */
+        interval = max(sysctl_numa_balancing_scan_delay,
+                         p->numa_scan_period << 2);
+        p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+
+        interval = max(sysctl_numa_balancing_scan_delay,
+                         current->numa_scan_period << 2);
+        current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+}
+#else
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+}
+#endif
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int this_cpu, int prev_cpu, int sync)
 {
@@ -5774,6 +5828,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         if (target == nr_cpumask_bits)
                 return prev_cpu;
 
+        update_wa_numa_placement(p, prev_cpu, target);
         schedstat_inc(sd->ttwu_move_affine);
         schedstat_inc(p->se.statistics.nr_wakeups_affine);
         return target;