author     Paul Mundt <lethal@linux-sh.org>       2009-01-08 07:04:47 -0500
committer  David Howells <dhowells@redhat.com>    2009-01-08 07:04:47 -0500
commit     dd8632a12e500a684478fea0951f380478d56fed (patch)
tree       1a12f441f9de14fd233faa92cf13a5fbb0319f41
parent     8feae13110d60cc6287afabc2887366b0eb226c2 (diff)
NOMMU: Make mmap allocation page trimming behaviour configurable.
NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to
the nearest power-of-2 number of pages.  Currently it then discards the excess
pages back to the page allocator, making that memory available for use by other
things.  This can, however, cause a greater amount of fragmentation.

To counter this, a sysctl is added in order to fine-tune the trimming
behaviour.  The default behaviour remains to trim pages aggressively, while
this can either be disabled completely or set to a higher page-granular
watermark in order to have finer-grained control.

The vm_region vm_top bits are taken from an earlier patch by David Howells.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
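For illustration only (this program is not part of the patch): a minimal
userspace sketch of the arithmetic that the patched do_mmap_private() performs,
showing how a request is rounded up to a power-of-2 number of pages and how a
hypothetical nr_trim_pages watermark decides whether the excess goes back to
the allocator.  PAGE_SHIFT and the example sizes are stand-ins for this demo.

	/* sketch of the NOMMU allocation rounding and trim decision */
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* smallest order such that (1 << order) pages covers len bytes */
	static unsigned int order_for(unsigned long len)
	{
		unsigned long pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned int order = 0;

		while ((1UL << order) < pages)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned long len = 6 * PAGE_SIZE;	/* a 6-page request */
		unsigned long nr_trim_pages = 1;	/* aggressive trimming (default) */
		unsigned long total = 1UL << order_for(len);	/* 8 pages allocated */
		unsigned long point = len >> PAGE_SHIFT;	/* 6 pages actually needed */

		if (nr_trim_pages && total - point >= nr_trim_pages)
			printf("trim %lu excess page(s), keep %lu\n",
			       total - point, point);
		else
			printf("keep all %lu pages attached to the mapping\n",
			       total);
		return 0;
	}

With the defaults above the excess of two pages is trimmed; raising
nr_trim_pages above two would leave the whole 8-page block with the mapping.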
-rw-r--r--  Documentation/nommu-mmap.txt  | 15
-rw-r--r--  Documentation/sysctl/vm.txt   | 18
-rw-r--r--  include/linux/mm_types.h      |  1
-rw-r--r--  kernel/sysctl.c               | 14
-rw-r--r--  mm/nommu.c                    | 65
5 files changed, 90 insertions(+), 23 deletions(-)
diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index 02b89dcf38ac..b565e8279d13 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -248,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT
 Provision of shared mappings on block device files is exactly the same as for
 character devices. If there isn't a real device underneath, then the driver
 should allocate sufficient contiguous memory to honour any supported mapping.
+
+
+=================================
+ADJUSTING PAGE TRIMMING BEHAVIOUR
+=================================
+
+NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages
+when performing an allocation. This can have adverse effects on memory
+fragmentation, and as such, is left configurable. The default behaviour is to
+aggressively trim allocations and discard any excess pages back in to the page
+allocator. In order to retain finer-grained control over fragmentation, this
+behaviour can either be disabled completely, or bumped up to a higher page
+watermark where trimming begins.
+
+Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'.
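As a usage note rather than part of the patch: once the vm_table entry added in
kernel/sysctl.c below is in place, the knob is reachable through procfs in the
usual way, so a NOMMU system can retune it at run time.  A minimal sketch of
doing that from C (error handling kept terse; the value passed in follows the
semantics documented above):

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		const char *path = "/proc/sys/vm/nr_trim_pages";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);	/* absent on CONFIG_MMU=y kernels */
			return 1;
		}
		/* 0 = never trim, 1 = trim aggressively, N = trim when excess >= N */
		fprintf(f, "%s\n", argc > 1 ? argv[1] : "1");
		fclose(f);
		return 0;
	}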
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index cd05994a49e6..a3415070bcac 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm:
 - numa_zonelist_order
 - nr_hugepages
 - nr_overcommit_hugepages
+- nr_trim_pages (only if CONFIG_MMU=n)
 
 ==============================================================
 
@@ -348,3 +349,20 @@ Change the maximum size of the hugepage pool. The maximum is
 nr_hugepages + nr_overcommit_hugepages.
 
 See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+nr_trim_pages
+
+This is available only on NOMMU kernels.
+
+This value adjusts the excess page trimming behaviour of power-of-2 aligned
+NOMMU mmap allocations.
+
+A value of 0 disables trimming of allocations entirely, while a value of 1
+trims excess pages aggressively. Any value >= 1 acts as the watermark where
+trimming of allocations is initiated.
+
+The default value is 1.
+
+See Documentation/nommu-mmap.txt for more information.
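To make the value semantics concrete (an illustrative example, not text from
the patch): a 5-page mapping is backed by a 2^3 = 8 page allocation, leaving 3
excess pages.  With nr_trim_pages = 1 (the default) all 3 excess pages are
returned to the page allocator; with nr_trim_pages = 4 the excess of 3 is below
the watermark, so the whole 8-page block stays with the mapping; with 0,
trimming never takes place.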
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c1e0d3a1714..92915e81443f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -106,6 +106,7 @@ struct vm_region {
 	unsigned long vm_flags;		/* VMA vm_flags */
 	unsigned long vm_start;		/* start address of region */
 	unsigned long vm_end;		/* region initialised to here */
+	unsigned long vm_top;		/* region allocated to here */
 	unsigned long vm_pgoff;		/* the offset in vm_file corresponding to vm_start */
 	struct file *vm_file;		/* the backing file or NULL */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 92f6e5bc3c24..89d74436318c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifndef CONFIG_MMU
+extern int sysctl_nr_trim_pages;
+#endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#else
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_trim_pages",
+		.data		= &sysctl_nr_trim_pages,
+		.maxlen		= sizeof(sysctl_nr_trim_pages),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #endif
 	{
 		.ctl_name	= VM_LAPTOP_MODE,
diff --git a/mm/nommu.c b/mm/nommu.c
index 0d363dfcf10e..a6e8ccfbd400 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  * Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007      Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org>
  */
 
 #include <linux/module.h>
@@ -66,6 +66,7 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
 int heap_stack_gap = 0;
 
 atomic_t mmap_pages_allocated;
@@ -455,6 +456,8 @@ static noinline void validate_nommu_regions(void)
 	last = rb_entry(lastp, struct vm_region, vm_rb);
 	if (unlikely(last->vm_end <= last->vm_start))
 		BUG();
+	if (unlikely(last->vm_top < last->vm_end))
+		BUG();
 
 	while ((p = rb_next(lastp))) {
 		region = rb_entry(p, struct vm_region, vm_rb);
@@ -462,7 +465,9 @@ static noinline void validate_nommu_regions(void)
 
 		if (unlikely(region->vm_end <= region->vm_start))
 			BUG();
-		if (unlikely(region->vm_start < last->vm_end))
+		if (unlikely(region->vm_top < region->vm_end))
+			BUG();
+		if (unlikely(region->vm_start < last->vm_top))
 			BUG();
 
 		lastp = p;
@@ -536,7 +541,7 @@ static void free_page_series(unsigned long from, unsigned long to)
 /*
  * release a reference to a region
  * - the caller must hold the region semaphore, which this releases
- * - the region may not have been added to the tree yet, in which case vm_end
+ * - the region may not have been added to the tree yet, in which case vm_top
  *   will equal vm_start
  */
 static void __put_nommu_region(struct vm_region *region)
@@ -547,7 +552,7 @@ static void __put_nommu_region(struct vm_region *region)
 	BUG_ON(!nommu_region_tree.rb_node);
 
 	if (atomic_dec_and_test(&region->vm_usage)) {
-		if (region->vm_end > region->vm_start)
+		if (region->vm_top > region->vm_start)
 			delete_nommu_region(region);
 		up_write(&nommu_region_sem);
 
@@ -558,7 +563,7 @@ static void __put_nommu_region(struct vm_region *region)
 		 * from ramfs/tmpfs mustn't be released here */
 		if (region->vm_flags & VM_MAPPED_COPY) {
 			kdebug("free series");
-			free_page_series(region->vm_start, region->vm_end);
+			free_page_series(region->vm_start, region->vm_top);
 		}
 		kmem_cache_free(vm_region_jar, region);
 	} else {
@@ -999,6 +1004,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
 	int ret;
 
 	ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+	if (ret == 0) {
+		vma->vm_region->vm_top = vma->vm_region->vm_end;
+		return ret;
+	}
 	if (ret != -ENOSYS)
 		return ret;
 
@@ -1027,11 +1036,14 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	 */
 	if (vma->vm_file) {
 		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
-		if (ret != -ENOSYS) {
+		if (ret == 0) {
 			/* shouldn't return success if we're not sharing */
-			BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
-			return ret; /* success or a real error */
+			BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
+			vma->vm_region->vm_top = vma->vm_region->vm_end;
+			return ret;
 		}
+		if (ret != -ENOSYS)
+			return ret;
 
 		/* getting an ENOSYS error indicates that direct mmap isn't
 		 * possible (as opposed to tried but failed) so we'll try to
@@ -1051,23 +1063,25 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	if (!pages)
 		goto enomem;
 
-	/* we allocated a power-of-2 sized page set, so we need to trim off the
-	 * excess */
 	total = 1 << order;
 	atomic_add(total, &mmap_pages_allocated);
 
 	point = rlen >> PAGE_SHIFT;
-	while (total > point) {
-		order = ilog2(total - point);
-		n = 1 << order;
-		kdebug("shave %lu/%lu @%lu", n, total - point, total);
-		atomic_sub(n, &mmap_pages_allocated);
-		total -= n;
-		set_page_refcounted(pages + total);
-		__free_pages(pages + total, order);
+
+	/* we allocated a power-of-2 sized page set, so we may want to trim off
+	 * the excess */
+	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+		while (total > point) {
+			order = ilog2(total - point);
+			n = 1 << order;
+			kdebug("shave %lu/%lu @%lu", n, total - point, total);
+			atomic_sub(n, &mmap_pages_allocated);
+			total -= n;
+			set_page_refcounted(pages + total);
+			__free_pages(pages + total, order);
+		}
 	}
 
-	total = rlen >> PAGE_SHIFT;
 	for (point = 1; point < total; point++)
 		set_page_refcounted(&pages[point]);
 
@@ -1075,6 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
 	region->vm_start = (unsigned long) base;
 	region->vm_end = region->vm_start + rlen;
+	region->vm_top = region->vm_start + (total << PAGE_SHIFT);
 
 	vma->vm_start = region->vm_start;
 	vma->vm_end = region->vm_start + len;
@@ -1110,6 +1125,7 @@ error_free:
 	free_page_series(region->vm_start, region->vm_end);
 	region->vm_start = vma->vm_start = 0;
 	region->vm_end = vma->vm_end = 0;
+	region->vm_top = 0;
 	return ret;
 
 enomem:
@@ -1401,7 +1417,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	npages = (addr - vma->vm_start) >> PAGE_SHIFT;
 
 	if (new_below) {
-		region->vm_end = new->vm_end = addr;
+		region->vm_top = region->vm_end = new->vm_end = addr;
 	} else {
 		region->vm_start = new->vm_start = addr;
 		region->vm_pgoff = new->vm_pgoff += npages;
@@ -1418,6 +1434,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
 	} else {
 		vma->vm_region->vm_end = vma->vm_end = addr;
+		vma->vm_region->vm_top = addr;
 	}
 	add_nommu_region(vma->vm_region);
 	add_nommu_region(new->vm_region);
@@ -1454,10 +1471,12 @@ static int shrink_vma(struct mm_struct *mm,
 
 	down_write(&nommu_region_sem);
 	delete_nommu_region(region);
-	if (from > region->vm_start)
-		region->vm_end = from;
-	else
+	if (from > region->vm_start) {
+		to = region->vm_top;
+		region->vm_top = region->vm_end = from;
+	} else {
 		region->vm_start = to;
+	}
 	add_nommu_region(region);
 	up_write(&nommu_region_sem);
 