author		Rik van Riel <riel@redhat.com>		2013-10-07 06:29:36 -0400
committer	Ingo Molnar <mingo@kernel.org>		2013-10-09 08:48:16 -0400
commit		04bb2f9475054298f0c67a89ca92cade42d3fe5e (patch)
tree		ab48887e23b7f820380a3f415cbe0a6f64f7fecc /kernel/sched/fair.c
parent		3e6a9418cf05638b103e34f5d13be0321872e623 (diff)
sched/numa: Adjust scan rate in task_numa_placement

Adjust numa_scan_period in task_numa_placement, depending on how much
useful work the numa code can do. The more local faults there are in a
given scan window, the longer the period (and hence the slower the scan
rate) during the next window. If there are excessive shared faults, the
scan period will decrease, with the amount of scaling depending on the
ratio of shared to private faults. If the preferred node changes, the
scan rate is reset to recheck whether the task is properly placed.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
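As a rough illustration of the scaling described above, here is a minimal
userspace sketch of the arithmetic. This is not kernel code: SLOTS and
THRESHOLD mirror the NUMA_PERIOD_SLOTS/NUMA_PERIOD_THRESHOLD constants
introduced by the patch, but the helper names, clamp bounds and sample fault
counts are invented for the example.

/*
 * Standalone sketch (userspace C) of the scan period scaling described
 * in the changelog. SLOTS/THRESHOLD mirror NUMA_PERIOD_SLOTS and
 * NUMA_PERIOD_THRESHOLD from the patch; everything else is illustrative.
 */
#include <stdio.h>

#define SLOTS     10	/* mirrors NUMA_PERIOD_SLOTS */
#define THRESHOLD  3	/* mirrors NUMA_PERIOD_THRESHOLD */

static long div_round_up(long n, long d)
{
	return (n + d - 1) / d;
}

static long clamp_long(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Compute the next scan period (ms) from one window's fault statistics. */
static long next_scan_period(long period,
			     long local, long remote,	/* locality of faults */
			     long private, long shared,	/* sharing of faults */
			     long period_min, long period_max)
{
	long period_slot = div_round_up(period, SLOTS);
	long ratio = (local * SLOTS) / (local + remote);
	long diff;

	if (ratio >= THRESHOLD) {
		/* Mostly local: lengthen the period by (ratio - THRESHOLD) slots. */
		long slot = ratio - THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Mostly remote: shorten the period, but damp the speed-up in
		 * proportion to how much of the traffic is shared, since faster
		 * scanning would mostly bounce migrations in that case.
		 */
		diff = -(THRESHOLD - ratio) * period_slot;
		diff = (diff * div_round_up(private * SLOTS, private + shared)) / SLOTS;
	}

	return clamp_long(period + diff, period_min, period_max);
}

int main(void)
{
	/* 90% local faults: 1000ms -> 1600ms (scan slower). */
	printf("%ld\n", next_scan_period(1000, 900, 100, 500, 500, 200, 60000));
	/* 10% local and mostly shared: 1000ms -> 980ms (only a mild speed-up). */
	printf("%ld\n", next_scan_period(1000, 100, 900, 100, 900, 200, 60000));
	return 0;
}

With a 1000ms period, a window that is 90% local grows the period to 1600ms,
while a window that is only 10% local but dominated by shared faults shrinks
it only slightly, to 980ms, because the private/shared ratio damps the change.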
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	112
1 file changed, 87 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d26a16e45437..66237ff8b01e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
 
 	sched_setnuma(p, env.dst_nid);
 
+	/*
+	 * Reset the scan period if the task is being rescheduled on an
+	 * alternative node to recheck if the task is now properly placed.
+	 */
+	p->numa_scan_period = task_scan_min(p);
+
 	if (env.best_task == NULL) {
 		int ret = migrate_task_to(p, env.best_cpu);
 		return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
 	p->numa_migrate_retry = jiffies + HZ*5;
 }
 
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease.
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+			unsigned long shared, unsigned long private)
+{
+	unsigned int period_slot;
+	int ratio;
+	int diff;
+
+	unsigned long remote = p->numa_faults_locality[0];
+	unsigned long local = p->numa_faults_locality[1];
+
+	/*
+	 * If there were no record hinting faults then either the task is
+	 * completely idle or all activity is in areas that are not of interest
+	 * to automatic numa balancing. Scan slower.
+	 */
+	if (local + shared == 0) {
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period << 1);
+
+		p->mm->numa_next_scan = jiffies +
+			msecs_to_jiffies(p->numa_scan_period);
+
+		return;
+	}
+
+	/*
+	 * Prepare to scale scan period relative to the current period.
+	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
+	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+	 */
+	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+	if (ratio >= NUMA_PERIOD_THRESHOLD) {
+		int slot = ratio - NUMA_PERIOD_THRESHOLD;
+		if (!slot)
+			slot = 1;
+		diff = slot * period_slot;
+	} else {
+		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+		/*
+		 * Scale scan rate increases based on sharing. There is an
+		 * inverse relationship between the degree of sharing and
+		 * the adjustment made to the scanning period. Broadly
+		 * speaking the intent is that there is little point
+		 * scanning faster if shared accesses dominate as it may
+		 * simply bounce migrations uselessly.
+		 */
+		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+	}
+
+	p->numa_scan_period = clamp(p->numa_scan_period + diff,
+			task_scan_min(p), task_scan_max(p));
+	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
+	unsigned long fault_types[2] = { 0, 0 };
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
 			/* Decay existing window, copy faults since last scan */
 			p->numa_faults[i] >>= 1;
 			p->numa_faults[i] += p->numa_faults_buffer[i];
+			fault_types[priv] += p->numa_faults_buffer[i];
 			p->numa_faults_buffer[i] = 0;
 
 			faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
 		}
 	}
 
+	update_task_scan_period(p, fault_types[0], fault_types[1]);
+
 	if (p->numa_group) {
 		/*
 		 * If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		BUG_ON(p->numa_faults_buffer);
 		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
 		p->total_numa_faults = 0;
+		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 	}
 
 	/*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		task_numa_group(p, last_cpupid, flags, &priv);
 	}
 
-	/*
-	 * If pages are properly placed (did not migrate) then scan slower.
-	 * This is reset periodically in case of phase changes
-	 */
-	if (!migrated) {
-		/* Initialise if necessary */
-		if (!p->numa_scan_period_max)
-			p->numa_scan_period_max = task_scan_max(p);
-
-		p->numa_scan_period = min(p->numa_scan_period_max,
-			p->numa_scan_period + 10);
-	}
-
 	task_numa_placement(p);
 
 	/* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 	p->numa_pages_migrated += pages;
 
 	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work)
 
 out:
 	/*
-	 * If the whole process was scanned without updates then no NUMA
-	 * hinting faults are being recorded and scan rate should be lower.
-	 */
-	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
-		p->numa_scan_period = min(p->numa_scan_period_max,
-			p->numa_scan_period << 1);
-
-		next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-		mm->numa_next_scan = next_scan;
-	}
-
-	/*
 	 * It is possible to reach the end of the VMA list but the last few
 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
 	 * would find the !migratable VMA on the next scan but not reset the