author    Christoph Lameter <clameter@sgi.com>    2006-01-18 20:42:31 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>  2006-01-18 22:20:17 -0500
commit    9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 (patch)
tree      20160098ec6ed8738cfecfc5f81181ad22b44e60
parent    f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 (diff)
[PATCH] Zone reclaim: Reclaim logic
Some bits for zone reclaim exist in 2.6.15 but they are not usable.  This
patch fixes them up, removes unused code and makes zone reclaim usable.

Zone reclaim allows the reclaiming of pages from a zone if the number of
free pages falls below the watermarks even if other zones still have
enough pages available.  Zone reclaim is of particular importance for
NUMA machines.  It can be more beneficial to reclaim a page than to take
the performance penalties that come with allocating a page from a remote
zone.

Zone reclaim is enabled if the maximum distance to another node is higher
than RECLAIM_DISTANCE, which may be defined by an arch.  By default
RECLAIM_DISTANCE is 20, which is the distance to another node in the same
component (enclosure or motherboard) on IA64.  The meaning of the NUMA
distance information seems to vary by arch.

If zone reclaim is not successful then no further reclaim attempts will
occur for a certain time period (ZONE_RECLAIM_INTERVAL).

This patch was discussed before.  See

  http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
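In condensed form, the allocator side of this patch works as sketched
below.  This is an illustrative summary assembled from the
get_page_from_freelist() hunk further down, using the same identifiers;
it is not a separate change:

	if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) {
		/*
		 * The zone is below its watermark.  Before spilling to a
		 * remote zone, try reclaiming locally -- but only if zone
		 * reclaim was switched on at boot (some node is farther
		 * than RECLAIM_DISTANCE) and zone_reclaim() is not still
		 * in its backoff window after a failed attempt.
		 */
		if (!zone_reclaim_mode || !zone_reclaim(*z, gfp_mask, order))
			continue;	/* give up; fall back off node */
		/* enough pages freed locally: allocate from this zone */
	}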
-rw-r--r--  include/linux/mmzone.h    12
-rw-r--r--  include/linux/swap.h      11
-rw-r--r--  include/linux/topology.h   8
-rw-r--r--  mm/page_alloc.c           17
-rw-r--r--  mm/vmscan.c               68
5 files changed, 108 insertions(+), 8 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 34cbefd2ebde..93a849f742db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,15 +149,17 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
-	/*
-	 * Does the allocator try to reclaim pages from the zone as soon
-	 * as it fails a watermark_ok() in __alloc_pages?
-	 */
-	int			reclaim_pages;
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
 
 	/*
+	 * timestamp (in jiffies) of the last zone reclaim that did not
+	 * result in freeing of pages. This is used to avoid repeated scans
+	 * if all memory in the zone is in use.
+	 */
+	unsigned long		last_unsuccessful_zone_reclaim;
+
+	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
 	 * target at the previous try_to_free_pages() or balance_pgdat()
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d01f7efb0f2c..4a99e4a7fbf3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else
+#define zone_reclaim_mode 0
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 315a5163d6a0..e8eb0040ce3a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
 #define REMOTE_DISTANCE 20
 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
+ * (in whatever arch specific measurement units returned by node_distance())
+ * then switch on zone reclaim on boot.
+ */
+#define RECLAIM_DISTANCE 20
+#endif
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS (1)
 #endif
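Because the default above is wrapped in #ifndef, an arch that knows its
own distance table can override the threshold from its topology header.
A hypothetical example (the arch name and the value 30 are illustrative
only, not part of this patch):

	/* include/asm-foo/topology.h -- hypothetical arch override */
	/*
	 * Only treat nodes beyond a node_distance() of 30 as far enough
	 * away to prefer local reclaim over an off-node allocation.
	 */
	#define RECLAIM_DISTANCE 30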
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 			mark = (*z)->pages_high;
 		if (!zone_watermark_ok(*z, order, mark,
 			       classzone_idx, alloc_flags))
-			continue;
+			if (!zone_reclaim_mode ||
+			    !zone_reclaim(*z, gfp_mask, order))
+				continue;
 	}
 
 	page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-			node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a9..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
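One detail of zone_reclaim() worth noting: a pass that frees nothing
stamps the zone with the current jiffies, and time_before() (from
<linux/jiffies.h>, safe across jiffies wraparound) then suppresses
rescans until ZONE_RECLAIM_INTERVAL (HZ/2, half a second) has elapsed.
A minimal sketch of that backoff pattern in isolation; last_failure and
expensive_scan() are hypothetical names, not from this patch:

	static unsigned long last_failure;	/* jiffies at last failed scan */

	static int maybe_scan(void)
	{
		/* Still inside the backoff window?  Skip the scan. */
		if (time_before(jiffies, last_failure + HZ/2))
			return 0;

		if (expensive_scan() == 0)	/* freed nothing */
			last_failure = jiffies;	/* arm the backoff again */

		return 1;
	}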