author     Christoph Lameter <clameter@sgi.com>      2006-01-18 20:42:31 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>     2006-01-18 22:20:17 -0500
commit     9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 (patch)
tree       20160098ec6ed8738cfecfc5f81181ad22b44e60 /mm
parent     f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 (diff)
[PATCH] Zone reclaim: Reclaim logic
Some bits for zone reclaim exist in 2.6.15, but they are not usable. This patch fixes them up, removes unused code and makes zone reclaim usable.

Zone reclaim allows the reclaiming of pages from a zone if the number of free pages falls below the watermarks, even if other zones still have enough pages available. Zone reclaim is of particular importance for NUMA machines: it can be more beneficial to reclaim a page than to take the performance penalty of allocating a page on a remote zone.

Zone reclaim is enabled if the maximum distance to another node is higher than RECLAIM_DISTANCE, which may be defined by an arch. By default RECLAIM_DISTANCE is 20; 20 is the distance to another node in the same component (enclosure or motherboard) on IA64. The meaning of the NUMA distance information seems to vary by arch.

If zone reclaim is not successful, no further reclaim attempts will occur for a certain time period (ZONE_RECLAIM_INTERVAL).

This patch was discussed before. See:
http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
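[Editor's note] The RECLAIM_DISTANCE cutoff itself lives in a header outside mm/ and is therefore not visible in the diff below (which is filtered to mm/). A minimal sketch of the arch-overridable default, assuming the usual pattern of a generic fallback in a topology header, would look like:

    /* Sketch only: an arch may supply its own value before this fallback applies. */
    #ifndef RECLAIM_DISTANCE
    #define RECLAIM_DISTANCE 20    /* node_distance() above which zone reclaim is enabled at boot */
    #endif
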
Diffstat (limited to 'mm')
-rw-r--r--   mm/page_alloc.c   17
-rw-r--r--   mm/vmscan.c       68
2 files changed, 82 insertions, 3 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-			node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a9..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
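
[Editor's note] The declarations this mm/ diff relies on (zone_reclaim_mode, zone_reclaim(), and the per-zone last_unsuccessful_zone_reclaim timestamp) are added in header files that are filtered out of this view. A rough sketch of what the allocator side needs, with header placement and the exact !CONFIG_NUMA stubs assumed rather than taken from this page, would be:

    /* include/linux/swap.h (sketch) */
    #ifdef CONFIG_NUMA
    extern int zone_reclaim_mode;
    extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
    #else
    #define zone_reclaim_mode 0
    static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
    {
    	return 0;
    }
    #endif

    /* include/linux/mmzone.h (sketch): field added to struct zone so the
     * back-off check in zone_reclaim() has somewhere to record failures. */
    unsigned long last_unsuccessful_zone_reclaim;

With the stubbed !CONFIG_NUMA variants compiling to constants, the new check in get_page_from_freelist() above costs nothing on non-NUMA builds.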