author		Paul Mundt <lethal@linux-sh.org>	2009-01-08 07:04:47 -0500
committer	David Howells <dhowells@redhat.com>	2009-01-08 07:04:47 -0500
commit		dd8632a12e500a684478fea0951f380478d56fed (patch)
tree		1a12f441f9de14fd233faa92cf13a5fbb0319f41
parent		8feae13110d60cc6287afabc2887366b0eb226c2 (diff)
NOMMU: Make mmap allocation page trimming behaviour configurable.
NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to
the nearest power-of-2 number of pages. Currently it then discards the excess
pages back to the page allocator, making that memory available for use by other
things. This can, however, cause a greater amount of fragmentation.
To counter this, a sysctl is added to fine-tune the trimming behaviour. The
default behaviour remains to trim pages aggressively, but trimming can now
either be disabled completely or held off until the excess reaches a
page-granular watermark, giving finer-grained control.
The vm_region vm_top bits are taken from an earlier patch by David Howells.
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
-rw-r--r--  Documentation/nommu-mmap.txt |  15 +++++++++++++++
-rw-r--r--  Documentation/sysctl/vm.txt  |  18 ++++++++++++++++++
-rw-r--r--  include/linux/mm_types.h     |   1 +
-rw-r--r--  kernel/sysctl.c              |  14 ++++++++++++++
-rw-r--r--  mm/nommu.c                   |  65 ++++++++++++++++++-----------
5 files changed, 90 insertions(+), 23 deletions(-)
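
Before the patch itself, a quick illustration of the arithmetic being tuned
(a minimal userspace sketch, assuming a 4KB page size; the order/total/point
names mirror the variables in do_mmap_private() below):

    /* A 5-page request is rounded up to an order-3 (8-page) allocation,
     * leaving 3 excess pages as trimming candidates. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long rlen  = 5 * 4096;     /* page-aligned request */
            unsigned long point = rlen >> 12;   /* pages needed: 5 */
            unsigned long order = 0;

            while ((1UL << order) < point)      /* round up to 2^order */
                    order++;

            printf("allocated %lu pages, %lu excess\n",
                   1UL << order, (1UL << order) - point);
            return 0;
    }

This prints "allocated 8 pages, 3 excess".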
diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index 02b89dcf38ac..b565e8279d13 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -248,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT
 Provision of shared mappings on block device files is exactly the same as for
 character devices. If there isn't a real device underneath, then the driver
 should allocate sufficient contiguous memory to honour any supported mapping.
+
+
+=================================
+ADJUSTING PAGE TRIMMING BEHAVIOUR
+=================================
+
+NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages
+when performing an allocation. This can have adverse effects on memory
+fragmentation, and as such, is left configurable. The default behaviour is to
+aggressively trim allocations and discard any excess pages back into the page
+allocator. In order to retain finer-grained control over fragmentation, this
+behaviour can either be disabled completely, or bumped up to a higher page
+watermark where trimming begins.
+
+Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'.
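
A minimal sketch of flipping the new knob from userspace, assuming the usual
/proc/sys file backing vm.nr_trim_pages (equivalent to running
sysctl vm.nr_trim_pages=0):

    #include <stdio.h>

    int main(void)
    {
            /* 0 = never trim; 1 = trim aggressively (the default);
             * N > 1 = trim only once at least N pages are excess. */
            FILE *f = fopen("/proc/sys/vm/nr_trim_pages", "w");

            if (!f) {
                    perror("nr_trim_pages"); /* absent on CONFIG_MMU=y kernels */
                    return 1;
            }
            fputs("0\n", f);
            fclose(f);
            return 0;
    }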
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index cd05994a49e6..a3415070bcac 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm:
 - numa_zonelist_order
 - nr_hugepages
 - nr_overcommit_hugepages
+- nr_trim_pages         (only if CONFIG_MMU=n)
 
 ==============================================================
 
@@ -348,3 +349,20 @@ Change the maximum size of the hugepage pool. The maximum is
 nr_hugepages + nr_overcommit_hugepages.
 
 See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+nr_trim_pages
+
+This is available only on NOMMU kernels.
+
+This value adjusts the excess page trimming behaviour of power-of-2 aligned
+NOMMU mmap allocations.
+
+A value of 0 disables trimming of allocations entirely, while a value of 1
+trims excess pages aggressively. Any value >= 1 acts as the watermark where
+trimming of allocations is initiated.
+
+The default value is 1.
+
+See Documentation/nommu-mmap.txt for more information.
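
The value semantics above boil down to a single comparison; a sketch (the
helper name is hypothetical) mirroring the check this patch adds to
do_mmap_private() in mm/nommu.c:

    /* Trim only when trimming is enabled and the excess has reached
     * the configured watermark. */
    static int should_trim(unsigned long total, unsigned long point,
                           int nr_trim_pages)
    {
            return nr_trim_pages && total - point >= nr_trim_pages;
    }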
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c1e0d3a1714..92915e81443f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -106,6 +106,7 @@ struct vm_region {
 	unsigned long	vm_flags;	/* VMA vm_flags */
 	unsigned long	vm_start;	/* start address of region */
 	unsigned long	vm_end;		/* region initialised to here */
+	unsigned long	vm_top;		/* region allocated to here */
 	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
 	struct file	*vm_file;	/* the backing file or NULL */
 
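
Together with the checks added to validate_nommu_regions() below, the new
field gives every region the invariant

    vm_start <= vm_end <= vm_top

where [vm_start, vm_end) is the span handed out to mappings and
[vm_end, vm_top) is untrimmed excess that still belongs to the region and is
only returned to the page allocator when the region itself is destroyed.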
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 92f6e5bc3c24..89d74436318c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifndef CONFIG_MMU
+extern int sysctl_nr_trim_pages;
+#endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#else
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_trim_pages",
+		.data		= &sysctl_nr_trim_pages,
+		.maxlen		= sizeof(sysctl_nr_trim_pages),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #endif
 	{
 		.ctl_name	= VM_LAPTOP_MODE,
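
Note the handler choice: unlike the plain proc_dointvec used by neighbouring
entries, proc_dointvec_minmax with .extra1 pointing at zero rejects negative
writes, so nr_trim_pages stays non-negative and the unsigned total - point
comparison in mm/nommu.c remains well behaved.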
diff --git a/mm/nommu.c b/mm/nommu.c
index 0d363dfcf10e..a6e8ccfbd400 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  * Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007      Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org>
  */
 
 #include <linux/module.h>
@@ -66,6 +66,7 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
 int heap_stack_gap = 0;
 
 atomic_t mmap_pages_allocated;
@@ -455,6 +456,8 @@ static noinline void validate_nommu_regions(void)
 	last = rb_entry(lastp, struct vm_region, vm_rb);
 	if (unlikely(last->vm_end <= last->vm_start))
 		BUG();
+	if (unlikely(last->vm_top < last->vm_end))
+		BUG();
 
 	while ((p = rb_next(lastp))) {
 		region = rb_entry(p, struct vm_region, vm_rb);
@@ -462,7 +465,9 @@ static noinline void validate_nommu_regions(void)
 
 		if (unlikely(region->vm_end <= region->vm_start))
 			BUG();
-		if (unlikely(region->vm_start < last->vm_end))
+		if (unlikely(region->vm_top < region->vm_end))
+			BUG();
+		if (unlikely(region->vm_start < last->vm_top))
 			BUG();
 
 		lastp = p;
@@ -536,7 +541,7 @@ static void free_page_series(unsigned long from, unsigned long to)
 /*
  * release a reference to a region
  * - the caller must hold the region semaphore, which this releases
- * - the region may not have been added to the tree yet, in which case vm_end
+ * - the region may not have been added to the tree yet, in which case vm_top
  *   will equal vm_start
  */
 static void __put_nommu_region(struct vm_region *region)
@@ -547,7 +552,7 @@ static void __put_nommu_region(struct vm_region *region)
 	BUG_ON(!nommu_region_tree.rb_node);
 
 	if (atomic_dec_and_test(&region->vm_usage)) {
-		if (region->vm_end > region->vm_start)
+		if (region->vm_top > region->vm_start)
 			delete_nommu_region(region);
 		up_write(&nommu_region_sem);
 
@@ -558,7 +563,7 @@ static void __put_nommu_region(struct vm_region *region)
 	 * from ramfs/tmpfs mustn't be released here */
 	if (region->vm_flags & VM_MAPPED_COPY) {
 		kdebug("free series");
-		free_page_series(region->vm_start, region->vm_end);
+		free_page_series(region->vm_start, region->vm_top);
 	}
 	kmem_cache_free(vm_region_jar, region);
 } else {
@@ -999,6 +1004,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
 	int ret;
 
 	ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+	if (ret == 0) {
+		vma->vm_region->vm_top = vma->vm_region->vm_end;
+		return ret;
+	}
 	if (ret != -ENOSYS)
 		return ret;
 
1004 | 1013 | ||
@@ -1027,11 +1036,14 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1027 | */ | 1036 | */ |
1028 | if (vma->vm_file) { | 1037 | if (vma->vm_file) { |
1029 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1038 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1030 | if (ret != -ENOSYS) { | 1039 | if (ret == 0) { |
1031 | /* shouldn't return success if we're not sharing */ | 1040 | /* shouldn't return success if we're not sharing */ |
1032 | BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); | 1041 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
1033 | return ret; /* success or a real error */ | 1042 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1043 | return ret; | ||
1034 | } | 1044 | } |
1045 | if (ret != -ENOSYS) | ||
1046 | return ret; | ||
1035 | 1047 | ||
1036 | /* getting an ENOSYS error indicates that direct mmap isn't | 1048 | /* getting an ENOSYS error indicates that direct mmap isn't |
1037 | * possible (as opposed to tried but failed) so we'll try to | 1049 | * possible (as opposed to tried but failed) so we'll try to |
@@ -1051,23 +1063,25 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	if (!pages)
 		goto enomem;
 
-	/* we allocated a power-of-2 sized page set, so we need to trim off the
-	 * excess */
 	total = 1 << order;
 	atomic_add(total, &mmap_pages_allocated);
 
 	point = rlen >> PAGE_SHIFT;
-	while (total > point) {
-		order = ilog2(total - point);
-		n = 1 << order;
-		kdebug("shave %lu/%lu @%lu", n, total - point, total);
-		atomic_sub(n, &mmap_pages_allocated);
-		total -= n;
-		set_page_refcounted(pages + total);
-		__free_pages(pages + total, order);
+
+	/* we allocated a power-of-2 sized page set, so we may want to trim off
+	 * the excess */
+	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+		while (total > point) {
+			order = ilog2(total - point);
+			n = 1 << order;
+			kdebug("shave %lu/%lu @%lu", n, total - point, total);
+			atomic_sub(n, &mmap_pages_allocated);
+			total -= n;
+			set_page_refcounted(pages + total);
+			__free_pages(pages + total, order);
+		}
 	}
 
-	total = rlen >> PAGE_SHIFT;
 	for (point = 1; point < total; point++)
 		set_page_refcounted(&pages[point]);
 
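
A standalone trace of the shave loop (a sketch only; ilog2_ul() stands in for
the kernel's ilog2()) for total = 8 pages and point = 5:

    #include <stdio.h>

    /* Highest set bit, as kernel ilog2() computes for a non-zero value. */
    static unsigned long ilog2_ul(unsigned long v)
    {
            unsigned long r = 0;
            while (v >>= 1)
                    r++;
            return r;
    }

    int main(void)
    {
            unsigned long total = 8, point = 5, order, n;

            while (total > point) {
                    order = ilog2_ul(total - point);
                    n = 1UL << order;
                    total -= n;
                    printf("shave %lu page(s) of order %lu at index %lu\n",
                           n, order, total);
            }
            return 0;
    }

This prints "shave 2 page(s) of order 1 at index 6" followed by "shave 1
page(s) of order 0 at index 5": the excess is freed back in the largest
power-of-2 chunks that fit, so the surviving allocation stays contiguous.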
@@ -1075,6 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
 	region->vm_start = (unsigned long) base;
 	region->vm_end = region->vm_start + rlen;
+	region->vm_top = region->vm_start + (total << PAGE_SHIFT);
 
 	vma->vm_start = region->vm_start;
 	vma->vm_end = region->vm_start + len;
@@ -1110,6 +1125,7 @@ error_free:
 	free_page_series(region->vm_start, region->vm_end);
 	region->vm_start = vma->vm_start = 0;
 	region->vm_end = vma->vm_end = 0;
+	region->vm_top = 0;
 	return ret;
 
 enomem:
@@ -1401,7 +1417,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	npages = (addr - vma->vm_start) >> PAGE_SHIFT;
 
 	if (new_below) {
-		region->vm_end = new->vm_end = addr;
+		region->vm_top = region->vm_end = new->vm_end = addr;
 	} else {
 		region->vm_start = new->vm_start = addr;
 		region->vm_pgoff = new->vm_pgoff += npages;
@@ -1418,6 +1434,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
 	} else {
 		vma->vm_region->vm_end = vma->vm_end = addr;
+		vma->vm_region->vm_top = addr;
 	}
 	add_nommu_region(vma->vm_region);
 	add_nommu_region(new->vm_region);
@@ -1454,10 +1471,12 @@ static int shrink_vma(struct mm_struct *mm,
 
 	down_write(&nommu_region_sem);
 	delete_nommu_region(region);
-	if (from > region->vm_start)
-		region->vm_end = from;
-	else
+	if (from > region->vm_start) {
+		to = region->vm_top;
+		region->vm_top = region->vm_end = from;
+	} else {
 		region->vm_start = to;
+	}
 	add_nommu_region(region);
 	up_write(&nommu_region_sem);
 