author    Christoph Lameter <clameter@sgi.com>    2006-01-18 20:42:31 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>  2006-01-18 22:20:17 -0500
commit    9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 (patch)
tree      20160098ec6ed8738cfecfc5f81181ad22b44e60
parent    f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 (diff)
[PATCH] Zone reclaim: Reclaim logic
Some bits for zone reclaim exist in 2.6.15 but they are not usable.  This
patch fixes them up, removes unused code and makes zone reclaim usable.

Zone reclaim allows the reclaiming of pages from a zone if the number of
free pages falls below the watermarks even if other zones still have
enough pages available.  Zone reclaim is of particular importance for
NUMA machines.  It can be more beneficial to reclaim a page than to take
the performance penalties that come with allocating a page from a remote
zone.

Zone reclaim is enabled if the maximum distance to another node is higher
than RECLAIM_DISTANCE, which may be defined by an arch.  By default
RECLAIM_DISTANCE is 20, which is the distance to another node in the same
component (enclosure or motherboard) on IA64.  The meaning of the NUMA
distance information seems to vary by arch.

If zone reclaim is not successful then no further reclaim attempts will
occur for a certain time period (ZONE_RECLAIM_INTERVAL).

This patch was discussed before.  See

  http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
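In condensed form, the allocator side of this patch works as sketched
below.  This is an illustrative summary assembled from the
get_page_from_freelist() hunk further down, using the same identifiers;
it is not a separate change:

	if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) {
		/*
		 * The zone is below its watermark.  Before spilling to a
		 * remote zone, try reclaiming locally -- but only if zone
		 * reclaim was switched on at boot (some node is farther
		 * than RECLAIM_DISTANCE) and zone_reclaim() is not still
		 * in its backoff window after a failed attempt.
		 */
		if (!zone_reclaim_mode || !zone_reclaim(*z, gfp_mask, order))
			continue;	/* give up; fall back off node */
		/* enough pages freed locally: allocate from this zone */
	}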
-rw-r--r--  include/linux/mmzone.h    12
-rw-r--r--  include/linux/swap.h      11
-rw-r--r--  include/linux/topology.h   8
-rw-r--r--  mm/page_alloc.c           17
-rw-r--r--  mm/vmscan.c               68
5 files changed, 108 insertions(+), 8 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 34cbefd2ebde..93a849f742db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,15 +149,17 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
-	/*
-	 * Does the allocator try to reclaim pages from the zone as soon
-	 * as it fails a watermark_ok() in __alloc_pages?
-	 */
-	int			reclaim_pages;
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
 
 	/*
+	 * timestamp (in jiffies) of the last zone reclaim that did not
+	 * result in freeing of pages. This is used to avoid repeated scans
+	 * if all memory in the zone is in use.
+	 */
+	unsigned long		last_unsuccessful_zone_reclaim;
+
+	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
 	 * target at the previous try_to_free_pages() or balance_pgdat()
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d01f7efb0f2c..4a99e4a7fbf3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else
+#define zone_reclaim_mode 0
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 315a5163d6a0..e8eb0040ce3a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
 #define REMOTE_DISTANCE 20
 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
+ * (in whatever arch specific measurement units returned by node_distance())
+ * then switch on zone reclaim on boot.
+ */
+#define RECLAIM_DISTANCE 20
+#endif
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS (1)
 #endif
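Because the default above is wrapped in #ifndef, an arch that knows its
own distance table can override the threshold from its topology header.
A hypothetical example (the arch name and the value 30 are illustrative
only, not part of this patch):

	/* include/asm-foo/topology.h -- hypothetical arch override */
	/*
	 * Only treat nodes beyond a node_distance() of 30 as far enough
	 * away to prefer local reclaim over an off-node allocation.
	 */
	#define RECLAIM_DISTANCE 30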
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 			mark = (*z)->pages_high;
 		if (!zone_watermark_ok(*z, order, mark,
 			       classzone_idx, alloc_flags))
-			continue;
+			if (!zone_reclaim_mode ||
+			    !zone_reclaim(*z, gfp_mask, order))
+				continue;
 	}
 
 	page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-			node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a9..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
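One detail of zone_reclaim() worth noting: a pass that frees nothing
stamps the zone with the current jiffies, and time_before() (from
<linux/jiffies.h>, safe across jiffies wraparound) then suppresses
rescans until ZONE_RECLAIM_INTERVAL (HZ/2, half a second) has elapsed.
A minimal sketch of that backoff pattern in isolation; last_failure and
expensive_scan() are hypothetical names, not from this patch:

	static unsigned long last_failure;	/* jiffies at last failed scan */

	static int maybe_scan(void)
	{
		/* Still inside the backoff window?  Skip the scan. */
		if (time_before(jiffies, last_failure + HZ/2))
			return 0;

		if (expensive_scan() == 0)	/* freed nothing */
			last_failure = jiffies;	/* arm the backoff again */

		return 1;
	}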