aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Hicks <mort@sgi.com> 2005-06-21 20:14:41 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-21 21:46:14 -0400
commit 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 (patch)
tree 41c9a7700d0858c1f77c5bdaba97e5b636f69b06
parent bfbb38fb808ac23ef44472d05d9bb36edfb49ed0 (diff)
[PATCH] VM: early zone reclaim
This is the core of the (much simplified) early reclaim. The goal of this patch is to reclaim some easily-freed pages from a zone before falling back onto another zone.

One of the major uses of this is NUMA machines. With the default allocator behavior the allocator would look for memory in another zone, which might be off-node, before trying to reclaim from the current zone.

This adds a zone tuneable to enable early zone reclaim. It is selected on a per-zone basis and is turned on/off via syscall.

Adding some extra throttling on the reclaim was also required (patch 4/4). Without it, the machine would grind to a crawl when doing a "make -j" kernel build. Even with this patch the System Time is higher on average, but it seems tolerable. Here are some numbers for kernbench runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

                      wall  user  sys  %cpu  ctx sw.  sleeps
                      ----  ----  ---  ----  -------  ------
No patch              1009  1384  847   258   298170  504402
w/patch, no reclaim    880  1376  667   288   254064  396745
w/patch & reclaim     1079  1385  926   252   291625  548873

These numbers are the average of 2 runs of 3 "make -j" runs done right after system boot. Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages takes 700 seconds with reclaim turned on and 735 seconds without reclaim (due to remote memory accesses).

The simple zone_reclaim syscall program is at http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- arch/i386/kernel/syscall_table.S 2
-rw-r--r-- arch/ia64/kernel/entry.S 2
-rw-r--r-- include/asm-i386/unistd.h 2
-rw-r--r-- include/asm-ia64/unistd.h 1
-rw-r--r-- include/linux/mmzone.h 6
-rw-r--r-- include/linux/swap.h 1
-rw-r--r-- kernel/sys_ni.c 1
-rw-r--r-- mm/page_alloc.c 33
-rw-r--r-- mm/vmscan.c 64
9 files changed, 104 insertions, 8 deletions
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 6cd1ed311f02..d408afaf6495 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -251,7 +251,7 @@ ENTRY(sys_call_table)
251 .long sys_io_submit 251 .long sys_io_submit
252 .long sys_io_cancel 252 .long sys_io_cancel
253 .long sys_fadvise64 /* 250 */ 253 .long sys_fadvise64 /* 250 */
254 .long sys_ni_syscall 254 .long sys_set_zone_reclaim
255 .long sys_exit_group 255 .long sys_exit_group
256 .long sys_lookup_dcookie 256 .long sys_lookup_dcookie
257 .long sys_epoll_create 257 .long sys_epoll_create
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index d99316c9be28..b1d5d3d5276c 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1579,7 +1579,7 @@ sys_call_table:
1579 data8 sys_keyctl 1579 data8 sys_keyctl
1580 data8 sys_ni_syscall 1580 data8 sys_ni_syscall
1581 data8 sys_ni_syscall // 1275 1581 data8 sys_ni_syscall // 1275
1582 data8 sys_ni_syscall 1582 data8 sys_set_zone_reclaim
1583 data8 sys_ni_syscall 1583 data8 sys_ni_syscall
1584 data8 sys_ni_syscall 1584 data8 sys_ni_syscall
1585 data8 sys_ni_syscall 1585 data8 sys_ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 61bcc1b1e3f4..176413fb9ae3 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -256,7 +256,7 @@
256#define __NR_io_submit 248 256#define __NR_io_submit 248
257#define __NR_io_cancel 249 257#define __NR_io_cancel 249
258#define __NR_fadvise64 250 258#define __NR_fadvise64 250
259 259#define __NR_set_zone_reclaim 251
260#define __NR_exit_group 252 260#define __NR_exit_group 252
261#define __NR_lookup_dcookie 253 261#define __NR_lookup_dcookie 253
262#define __NR_epoll_create 254 262#define __NR_epoll_create 254
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 33e26c557c5c..f7f43ec2483a 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -263,6 +263,7 @@
263#define __NR_add_key 1271 263#define __NR_add_key 1271
264#define __NR_request_key 1272 264#define __NR_request_key 1272
265#define __NR_keyctl 1273 265#define __NR_keyctl 1273
266#define __NR_set_zone_reclaim 1276
266 267
267#ifdef __KERNEL__ 268#ifdef __KERNEL__
268 269
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index beacd931b606..dfc2452ccb10 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -145,6 +145,12 @@ struct zone {
145 int all_unreclaimable; /* All pages pinned */ 145 int all_unreclaimable; /* All pages pinned */
146 146
147 /* 147 /*
148 * Does the allocator try to reclaim pages from the zone as soon
149 * as it fails a watermark_ok() in __alloc_pages?
150 */
151 int reclaim_pages;
152
153 /*
148 * prev_priority holds the scanning priority for this zone. It is 154 * prev_priority holds the scanning priority for this zone. It is
149 * defined as the scanning priority at which we achieved our reclaim 155 * defined as the scanning priority at which we achieved our reclaim
150 * target at the previous try_to_free_pages() or balance_pgdat() 156 * target at the previous try_to_free_pages() or balance_pgdat()
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3bbc41be9bd0..0d21e682d99d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -173,6 +173,7 @@ extern void swap_setup(void);
173 173
174/* linux/mm/vmscan.c */ 174/* linux/mm/vmscan.c */
175extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); 175extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
176extern int zone_reclaim(struct zone *, unsigned int, unsigned int);
176extern int shrink_all_memory(int); 177extern int shrink_all_memory(int);
177extern int vm_swappiness; 178extern int vm_swappiness;
178 179
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70ed1f98..6f15bea7d1a8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -77,6 +77,7 @@ cond_syscall(sys_request_key);
77cond_syscall(sys_keyctl); 77cond_syscall(sys_keyctl);
78cond_syscall(compat_sys_keyctl); 78cond_syscall(compat_sys_keyctl);
79cond_syscall(compat_sys_socketcall); 79cond_syscall(compat_sys_socketcall);
80cond_syscall(sys_set_zone_reclaim);
80 81
81/* arch-specific weak syscall entries */ 82/* arch-specific weak syscall entries */
82cond_syscall(sys_pciconfig_read); 83cond_syscall(sys_pciconfig_read);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40169f0b7e9e..3c0f69ded6b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
724 return 1; 724 return 1;
725} 725}
726 726
727static inline int
728should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
729{
730 if (!z->reclaim_pages)
731 return 0;
732 return 1;
733}
734
727/* 735/*
728 * This is the 'heart' of the zoned buddy allocator. 736 * This is the 'heart' of the zoned buddy allocator.
729 */ 737 */
@@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
760 768
761 classzone_idx = zone_idx(zones[0]); 769 classzone_idx = zone_idx(zones[0]);
762 770
763 restart: 771restart:
764 /* Go through the zonelist once, looking for a zone with enough free */ 772 /* Go through the zonelist once, looking for a zone with enough free */
765 for (i = 0; (z = zones[i]) != NULL; i++) { 773 for (i = 0; (z = zones[i]) != NULL; i++) {
766 774 int do_reclaim = should_reclaim_zone(z, gfp_mask);
767 if (!zone_watermark_ok(z, order, z->pages_low,
768 classzone_idx, 0, 0))
769 continue;
770 775
771 if (!cpuset_zone_allowed(z)) 776 if (!cpuset_zone_allowed(z))
772 continue; 777 continue;
773 778
779 /*
780 * If the zone is to attempt early page reclaim then this loop
781 * will try to reclaim pages and check the watermark a second
782 * time before giving up and falling back to the next zone.
783 */
784zone_reclaim_retry:
785 if (!zone_watermark_ok(z, order, z->pages_low,
786 classzone_idx, 0, 0)) {
787 if (!do_reclaim)
788 continue;
789 else {
790 zone_reclaim(z, gfp_mask, order);
791 /* Only try reclaim once */
792 do_reclaim = 0;
793 goto zone_reclaim_retry;
794 }
795 }
796
774 page = buffered_rmqueue(z, order, gfp_mask); 797 page = buffered_rmqueue(z, order, gfp_mask);
775 if (page) 798 if (page)
776 goto got_pg; 799 goto got_pg;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6379ddbffd9b..7da846960d8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1323,3 +1323,67 @@ static int __init kswapd_init(void)
1323} 1323}
1324 1324
1325module_init(kswapd_init) 1325module_init(kswapd_init)
1326
1327
1328/*
1329 * Try to free up some pages from this zone through reclaim.
1330 */
1331int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
1332{
1333 struct scan_control sc;
1334 int nr_pages = 1 << order;
1335 int total_reclaimed = 0;
1336
1337 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1338 if (!(gfp_mask & __GFP_WAIT))
1339 return 0;
1340 if (zone->all_unreclaimable)
1341 return 0;
1342
1343 sc.gfp_mask = gfp_mask;
1344 sc.may_writepage = 0;
1345 sc.may_swap = 0;
1346 sc.nr_mapped = read_page_state(nr_mapped);
1347 sc.nr_scanned = 0;
1348 sc.nr_reclaimed = 0;
1349 /* scan at the highest priority */
1350 sc.priority = 0;
1351
1352 if (nr_pages > SWAP_CLUSTER_MAX)
1353 sc.swap_cluster_max = nr_pages;
1354 else
1355 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1356
1357 shrink_zone(zone, &sc);
1358 total_reclaimed = sc.nr_reclaimed;
1359
1360 return total_reclaimed;
1361}
1362
1363asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1364 unsigned int state)
1365{
1366 struct zone *z;
1367 int i;
1368
1369 if (node >= MAX_NUMNODES || !node_online(node))
1370 return -EINVAL;
1371
1372 /* This will break if we ever add more zones */
1373 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1374 return -EINVAL;
1375
1376 for (i = 0; i < MAX_NR_ZONES; i++) {
1377 if (!(zone & 1<<i))
1378 continue;
1379
1380 z = &NODE_DATA(node)->node_zones[i];
1381
1382 if (state)
1383 z->reclaim_pages = 1;
1384 else
1385 z->reclaim_pages = 0;
1386 }
1387
1388 return 0;
1389}