[PATCH] VM: early zone reclaim

This is the core of the (much simplified) early reclaim. The goal of this patch is to reclaim some easily-freed pages from a zone before falling back onto another zone.

One of the major uses of this is NUMA machines. With the default allocator behavior the allocator would look for memory in another zone, which might be off-node, before trying to reclaim from the current zone.

This adds a zone tunable to enable early zone reclaim. It is selected on a per-zone basis and is turned on/off via a syscall.

Adding some extra throttling on the reclaim was also required (patch 4/4). Without it, the machine would grind to a crawl when doing a "make -j" kernel build. Even with this patch the System Time is higher on average, but it seems tolerable. Here are some numbers for kernbench runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

                       wall   user   sys   %cpu   ctx sw.   sleeps
                       ----   ----   ---   ----   -------   ------
No patch               1009   1384   847    258    298170   504402
w/patch, no reclaim     880   1376   667    288    254064   396745
w/patch & reclaim      1079   1385   926    252    291625   548873

These numbers are the average of 2 runs of 3 "make -j" runs done right after system boot. Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages takes 700 seconds with reclaim turned on and 735 seconds without reclaim (due to remote memory accesses).

The simple zone_reclaim syscall program is at http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Martin Hicks <mort@sgi.com> 2005-06-21 20:14:41 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-21 21:46:14 -0400
commit: 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 (patch)
tree: 41c9a7700d0858c1f77c5bdaba97e5b636f69b06 /mm
parent: bfbb38fb808ac23ef44472d05d9bb36edfb49ed0 (diff)
2 files changed, 92 insertions, 5 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40169f0b7e9e..3c0f69ded6b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        return 1;
 }
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+        if (!z->reclaim_pages)
+                return 0;
+        return 1;
+}
 /*
 * This is the 'heart' of the zoned buddy allocator.
 */
@@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
        classzone_idx = zone_idx(zones[0]);
- restart:
+restart:
        /* Go through the zonelist once, looking for a zone with enough free */
        for (i = 0; (z = zones[i]) != NULL; i++) {
+                int do_reclaim = should_reclaim_zone(z, gfp_mask);
-                if (!zone_watermark_ok(z, order, z->pages_low,
-                                       classzone_idx, 0, 0))
-                        continue;
                if (!cpuset_zone_allowed(z))
                        continue;
+                /*
+                 * If the zone is to attempt early page reclaim then this loop
+                 * will try to reclaim pages and check the watermark a second
+                 * time before giving up and falling back to the next zone.
+                 */
+zone_reclaim_retry:
+                if (!zone_watermark_ok(z, order, z->pages_low,
+                                       classzone_idx, 0, 0)) {
+                        if (!do_reclaim)
+                                continue;
+                        else {
+                                zone_reclaim(z, gfp_mask, order);
+                                /* Only try reclaim once */
+                                do_reclaim = 0;
+                                goto zone_reclaim_retry;
+                        }
+                }
                page = buffered_rmqueue(z, order, gfp_mask);
                if (page)
                        goto got_pg;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6379ddbffd9b..7da846960d8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1323,3 +1323,67 @@ static int __init kswapd_init(void)
 }
 module_init(kswapd_init)
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+        struct scan_control sc;
+        int nr_pages = 1 << order;
+        int total_reclaimed = 0;
+        /* The reclaim may sleep, so don't do it if sleep isn't allowed */
+        if (!(gfp_mask & __GFP_WAIT))
+                return 0;
+        if (zone->all_unreclaimable)
+                return 0;
+        sc.gfp_mask = gfp_mask;
+        sc.may_writepage = 0;
+        sc.may_swap = 0;
+        sc.nr_mapped = read_page_state(nr_mapped);
+        sc.nr_scanned = 0;
+        sc.nr_reclaimed = 0;
+        /* scan at the highest priority */
+        sc.priority = 0;
+        if (nr_pages > SWAP_CLUSTER_MAX)
+                sc.swap_cluster_max = nr_pages;
+        else
+                sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+        shrink_zone(zone, &sc);
+        total_reclaimed = sc.nr_reclaimed;
+        return total_reclaimed;
+}
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+                                     unsigned int state)
+{
+        struct zone *z;
+        int i;
+        if (node >= MAX_NUMNODES || !node_online(node))
+                return -EINVAL;
+        /* This will break if we ever add more zones */
+        if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+                return -EINVAL;
+        for (i = 0; i < MAX_NR_ZONES; i++) {
+                if (!(zone & 1<<i))
+                        continue;
+                z = &NODE_DATA(node)->node_zones[i];
+                if (state)
+                        z->reclaim_pages = 1;
+                else
+                        z->reclaim_pages = 0;
+        }
+        return 0;
+}
author	Martin Hicks <mort@sgi.com>	2005-06-21 20:14:41 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-21 21:46:14 -0400
commit	753ee728964e5afb80c17659cc6c3a6fd0a42fe0 (patch)
tree	41c9a7700d0858c1f77c5bdaba97e5b636f69b06 /mm
parent	bfbb38fb808ac23ef44472d05d9bb36edfb49ed0 (diff)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40169f0b7e9e..3c0f69ded6b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
724	return 1;	724	return 1;
725	}	725	}
726		726
		727	static inline int
		728	should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
		729	{
		730	if (!z->reclaim_pages)
		731	return 0;
		732	return 1;
		733	}
		734
727	/*	735	/*
728	* This is the 'heart' of the zoned buddy allocator.	736	* This is the 'heart' of the zoned buddy allocator.
729	*/	737	*/
@@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
760		768
761	classzone_idx = zone_idx(zones[0]);	769	classzone_idx = zone_idx(zones[0]);
762		770
763	restart:	771	restart:
764	/* Go through the zonelist once, looking for a zone with enough free */	772	/* Go through the zonelist once, looking for a zone with enough free */
765	for (i = 0; (z = zones[i]) != NULL; i++) {	773	for (i = 0; (z = zones[i]) != NULL; i++) {
766		774	int do_reclaim = should_reclaim_zone(z, gfp_mask);
767	if (!zone_watermark_ok(z, order, z->pages_low,
768	classzone_idx, 0, 0))
769	continue;
770		775
771	if (!cpuset_zone_allowed(z))	776	if (!cpuset_zone_allowed(z))
772	continue;	777	continue;
773		778
		779	/*
		780	* If the zone is to attempt early page reclaim then this loop
		781	* will try to reclaim pages and check the watermark a second
		782	* time before giving up and falling back to the next zone.
		783	*/
		784	zone_reclaim_retry:
		785	if (!zone_watermark_ok(z, order, z->pages_low,
		786	classzone_idx, 0, 0)) {
		787	if (!do_reclaim)
		788	continue;
		789	else {
		790	zone_reclaim(z, gfp_mask, order);
		791	/* Only try reclaim once */
		792	do_reclaim = 0;
		793	goto zone_reclaim_retry;
		794	}
		795	}
		796
774	page = buffered_rmqueue(z, order, gfp_mask);	797	page = buffered_rmqueue(z, order, gfp_mask);
775	if (page)	798	if (page)
776	goto got_pg;	799	goto got_pg;


diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6379ddbffd9b..7da846960d8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1323,3 +1323,67 @@ static int __init kswapd_init(void)
1323	}	1323	}
1324		1324
1325	module_init(kswapd_init)	1325	module_init(kswapd_init)
		1326
		1327
		1328	/*
		1329	* Try to free up some pages from this zone through reclaim.
		1330	*/
		1331	int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
		1332	{
		1333	struct scan_control sc;
		1334	int nr_pages = 1 << order;
		1335	int total_reclaimed = 0;
		1336
		1337	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
		1338	if (!(gfp_mask & __GFP_WAIT))
		1339	return 0;
		1340	if (zone->all_unreclaimable)
		1341	return 0;
		1342
		1343	sc.gfp_mask = gfp_mask;
		1344	sc.may_writepage = 0;
		1345	sc.may_swap = 0;
		1346	sc.nr_mapped = read_page_state(nr_mapped);
		1347	sc.nr_scanned = 0;
		1348	sc.nr_reclaimed = 0;
		1349	/* scan at the highest priority */
		1350	sc.priority = 0;
		1351
		1352	if (nr_pages > SWAP_CLUSTER_MAX)
		1353	sc.swap_cluster_max = nr_pages;
		1354	else
		1355	sc.swap_cluster_max = SWAP_CLUSTER_MAX;
		1356
		1357	shrink_zone(zone, &sc);
		1358	total_reclaimed = sc.nr_reclaimed;
		1359
		1360	return total_reclaimed;
		1361	}
		1362
		1363	asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
		1364	unsigned int state)
		1365	{
		1366	struct zone *z;
		1367	int i;
		1368
		1369	if (node >= MAX_NUMNODES || !node_online(node))
		1370	return -EINVAL;
		1371
		1372	/* This will break if we ever add more zones */
		1373	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
		1374	return -EINVAL;
		1375
		1376	for (i = 0; i < MAX_NR_ZONES; i++) {
		1377	if (!(zone & 1<<i))
		1378	continue;
		1379
		1380	z = &NODE_DATA(node)->node_zones[i];
		1381
		1382	if (state)
		1383	z->reclaim_pages = 1;
		1384	else
		1385	z->reclaim_pages = 0;
		1386	}
		1387
		1388	return 0;
		1389	}