author		Mel Gorman <mgorman@techsingularity.net>	2018-05-09 12:31:15 -0400
committer	Ingo Molnar <mingo@kernel.org>	2018-05-12 02:37:56 -0400
commit		789ba28013ce23dbf5e9f5f014f4233b35523bf3
tree		14762071a1c579fc7eaf072423b0118d9b8b26dc
parent		94d7dbf108813ea45a91e27e9a8bd231d5a23fa7
Revert "sched/numa: Delay retrying placement for automatic NUMA balance after wake_affine()"
This reverts commit 7347fc87dfe6b7315e74310ee1243dc222c68086.

Srikar Dronamraju pointed out that, while the commit in question did show a performance improvement on ppc64, it did so at the cost of disabling active CPU migration by automatic NUMA balancing, which was not the intent. The issue was a serious flaw in the logic that meant active balancing was never triggered if SD_WAKE_AFFINE was disabled on scheduler domains. Even when it is enabled, the logic is still bizarre and against the original intent.

Investigation showed that the candidate fixes (the approach he suggested, using the correct comparison for jiffies values, and introducing a new numa_migrate_deferred variable in task_struct) all perform similarly to a revert, with a mix of gains and losses depending on the workload, machine and socket count.

The original intent of the commit was to handle a problem whereby wake_affine, idle balancing and automatic NUMA balancing disagree on the appropriate placement of a task. This was particularly true where a single task was a massive waker of tasks but the wake_wide logic did not apply, and was especially noticeable when a futex (a barrier) woke all worker threads and tried pulling the wakees to the waker's node. That specific case could be handled by tuning MPI or OpenMP appropriately, but the behaviour is not illogical and was worth attempting to fix. However, the approach was wrong. Given that we're at rc4 and a fix is not obvious, it's better to play safe, revert this commit and retry later.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: ggherdovich@suse.cz
Cc: hpa@zytor.com
Cc: matt@codeblueprint.co.uk
Cc: mpe@ellerman.id.au
Link: http://lkml.kernel.org/r/20180509163115.6fnnyeg4vdm2ct4v@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
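Note on the "correct comparison for jiffies values" mentioned above: jiffies-based deadlines are normally compared with the wrap-safe helpers from <linux/jiffies.h> (time_after()/time_before()) rather than a raw '>', which misbehaves once the counter wraps. The sketch below is illustrative only: it is not the merged change (the revert removes the check entirely), and numa_placement_deferred() is a hypothetical helper name. It merely shows how the intent described in the reverted comment, backing off while a deferral window set by wake_affine() is still in the future, could be expressed wrap-safely.

#include <linux/jiffies.h>	/* jiffies, time_after() */
#include <linux/sched.h>	/* struct task_struct */

/*
 * Hypothetical helper, for illustration only: returns true while the
 * task's NUMA placement retry timestamp still lies in the future, i.e.
 * while placement should be deferred. time_after() is wraparound-safe,
 * unlike the plain '>' comparison used in the reverted code. Note that
 * p->numa_migrate_retry only exists under CONFIG_NUMA_BALANCING.
 */
static bool numa_placement_deferred(struct task_struct *p)
{
	return time_after(p->numa_migrate_retry, jiffies);
}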
-rw-r--r--	kernel/sched/fair.c	57
1 file changed, 1 insertion(+), 56 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54dc31e7ab9b..f43627c6bb3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	unsigned long interval = HZ;
-	unsigned long numa_migrate_retry;
 
 	/* This task has no NUMA fault statistics yet */
 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
 
 	/* Periodically retry migrating the task to the preferred node */
 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-	numa_migrate_retry = jiffies + interval;
-
-	/*
-	 * Check that the new retry threshold is after the current one. If
-	 * the retry is in the future, it implies that wake_affine has
-	 * temporarily asked NUMA balancing to backoff from placement.
-	 */
-	if (numa_migrate_retry > p->numa_migrate_retry)
-		return;
-
-	/* Safe to try placing the task on the preferred node */
-	p->numa_migrate_retry = numa_migrate_retry;
+	p->numa_migrate_retry = jiffies + interval;
 
 	/* Success if task is already running on preferred CPU */
 	if (task_node(p) == p->numa_preferred_nid)
@@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-	unsigned long interval;
-
-	if (!static_branch_likely(&sched_numa_balancing))
-		return;
-
-	/* If balancing has no preference then continue gathering data */
-	if (p->numa_preferred_nid == -1)
-		return;
-
-	/*
-	 * If the wakeup is not affecting locality then it is neutral from
-	 * the perspective of NUMA balancing so continue gathering data.
-	 */
-	if (cpu_to_node(prev_cpu) == cpu_to_node(target))
-		return;
-
-	/*
-	 * Temporarily prevent NUMA balancing trying to place waker/wakee after
-	 * wakee has been moved by wake_affine. This will potentially allow
-	 * related tasks to converge and update their data placement. The
-	 * 4 * numa_scan_period is to allow the two-pass filter to migrate
-	 * hot data to the wakers node.
-	 */
-	interval = max(sysctl_numa_balancing_scan_delay,
-			p->numa_scan_period << 2);
-	p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-
-	interval = max(sysctl_numa_balancing_scan_delay,
-			current->numa_scan_period << 2);
-	current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-}
-#else
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-}
-#endif
-
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int this_cpu, int prev_cpu, int sync)
 {
@@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	if (target == nr_cpumask_bits)
 		return prev_cpu;
 
-	update_wa_numa_placement(p, prev_cpu, target);
 	schedstat_inc(sd->ttwu_move_affine);
 	schedstat_inc(p->se.statistics.nr_wakeups_affine);
 	return target;