diff options
| author | KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | 2009-04-30 18:08:51 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-05-02 18:36:10 -0400 |
| commit | 00a62ce91e554198ef28234c91c36f850f5a3bc9 (patch) | |
| tree | 367ef134219deef91903c3fa0eb108c13658f2c7 | |
| parent | 0763ed2355198cdef2f6a2098e9d52eb1fe4365d (diff) | |
mm: fix Committed_AS underflow on large NR_CPUS environment
The Committed_AS field can underflow in certain situations:
> # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c
> 1 Committed_AS: 18446744073709323392 kB
> 11 Committed_AS: 18446744073709455488 kB
> 6 Committed_AS: 35136 kB
> 5 Committed_AS: 18446744073709454400 kB
> 7 Committed_AS: 35904 kB
> 3 Committed_AS: 18446744073709453248 kB
> 2 Committed_AS: 34752 kB
> 9 Committed_AS: 18446744073709453248 kB
> 8 Committed_AS: 34752 kB
> 3 Committed_AS: 18446744073709320960 kB
> 7 Committed_AS: 18446744073709454080 kB
> 3 Committed_AS: 18446744073709320960 kB
> 5 Committed_AS: 18446744073709454080 kB
> 6 Committed_AS: 18446744073709320960 kB
Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does
not check for underflow.
But a calculation proportional to NR_CPUS isn't a good one. In general, the
possibility of lock contention is proportional to the number of online
cpus, not the theoretical maximum number of cpus (NR_CPUS).
The current kernel has generic percpu-counter infrastructure; using it is the
right way. It simplifies the code, and percpu_counter_read_positive() doesn't
have the underflow issue.
Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: <stable@kernel.org> [All kernel versions]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
| -rw-r--r-- | fs/proc/meminfo.c | 2 | ||||
| -rw-r--r-- | include/linux/mman.h | 9 | ||||
| -rw-r--r-- | mm/mmap.c | 12 | ||||
| -rw-r--r-- | mm/nommu.c | 13 | ||||
| -rw-r--r-- | mm/swap.c | 46 |
5 files changed, 17 insertions, 65 deletions
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74ea974f5ca6..c6b0302af4c4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
| @@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 35 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 35 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
| 36 | si_meminfo(&i); | 36 | si_meminfo(&i); |
| 37 | si_swapinfo(&i); | 37 | si_swapinfo(&i); |
| 38 | committed = atomic_long_read(&vm_committed_space); | 38 | committed = percpu_counter_read_positive(&vm_committed_as); |
| 39 | allowed = ((totalram_pages - hugetlb_total_pages()) | 39 | allowed = ((totalram_pages - hugetlb_total_pages()) |
| 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; |
| 41 | 41 | ||
diff --git a/include/linux/mman.h b/include/linux/mman.h index 30d1073bac3b..9872d6ca58ae 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
| @@ -12,21 +12,18 @@ | |||
| 12 | 12 | ||
| 13 | #ifdef __KERNEL__ | 13 | #ifdef __KERNEL__ |
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | #include <linux/percpu_counter.h> | ||
| 15 | 16 | ||
| 16 | #include <asm/atomic.h> | 17 | #include <asm/atomic.h> |
| 17 | 18 | ||
| 18 | extern int sysctl_overcommit_memory; | 19 | extern int sysctl_overcommit_memory; |
| 19 | extern int sysctl_overcommit_ratio; | 20 | extern int sysctl_overcommit_ratio; |
| 20 | extern atomic_long_t vm_committed_space; | 21 | extern struct percpu_counter vm_committed_as; |
| 21 | 22 | ||
| 22 | #ifdef CONFIG_SMP | ||
| 23 | extern void vm_acct_memory(long pages); | ||
| 24 | #else | ||
| 25 | static inline void vm_acct_memory(long pages) | 23 | static inline void vm_acct_memory(long pages) |
| 26 | { | 24 | { |
| 27 | atomic_long_add(pages, &vm_committed_space); | 25 | percpu_counter_add(&vm_committed_as, pages); |
| 28 | } | 26 | } |
| 29 | #endif | ||
| 30 | 27 | ||
| 31 | static inline void vm_unacct_memory(long pages) | 28 | static inline void vm_unacct_memory(long pages) |
| 32 | { | 29 | { |
| @@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
| 85 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 85 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 86 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 86 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
| 87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
| 88 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 88 | struct percpu_counter vm_committed_as; |
| 89 | 89 | ||
| 90 | /* | 90 | /* |
| 91 | * Check that a process has enough memory to allocate a new virtual | 91 | * Check that a process has enough memory to allocate a new virtual |
| @@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 179 | if (mm) | 179 | if (mm) |
| 180 | allowed -= mm->total_vm / 32; | 180 | allowed -= mm->total_vm / 32; |
| 181 | 181 | ||
| 182 | /* | 182 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| 183 | * cast `allowed' as a signed long because vm_committed_space | ||
| 184 | * sometimes has a negative value | ||
| 185 | */ | ||
| 186 | if (atomic_long_read(&vm_committed_space) < (long)allowed) | ||
| 187 | return 0; | 183 | return 0; |
| 188 | error: | 184 | error: |
| 189 | vm_unacct_memory(pages); | 185 | vm_unacct_memory(pages); |
| @@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm) | |||
| 2481 | */ | 2477 | */ |
| 2482 | void __init mmap_init(void) | 2478 | void __init mmap_init(void) |
| 2483 | { | 2479 | { |
| 2480 | int ret; | ||
| 2481 | |||
| 2482 | ret = percpu_counter_init(&vm_committed_as, 0); | ||
| 2483 | VM_BUG_ON(ret); | ||
| 2484 | } | 2484 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index 72eda4aee2cb..809998aa7b50 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -62,7 +62,7 @@ void *high_memory; | |||
| 62 | struct page *mem_map; | 62 | struct page *mem_map; |
| 63 | unsigned long max_mapnr; | 63 | unsigned long max_mapnr; |
| 64 | unsigned long num_physpages; | 64 | unsigned long num_physpages; |
| 65 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 65 | struct percpu_counter vm_committed_as; |
| 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
| 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
| @@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 463 | */ | 463 | */ |
| 464 | void __init mmap_init(void) | 464 | void __init mmap_init(void) |
| 465 | { | 465 | { |
| 466 | int ret; | ||
| 467 | |||
| 468 | ret = percpu_counter_init(&vm_committed_as, 0); | ||
| 469 | VM_BUG_ON(ret); | ||
| 466 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); | 470 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); |
| 467 | } | 471 | } |
| 468 | 472 | ||
| @@ -1847,12 +1851,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1847 | if (mm) | 1851 | if (mm) |
| 1848 | allowed -= mm->total_vm / 32; | 1852 | allowed -= mm->total_vm / 32; |
| 1849 | 1853 | ||
| 1850 | /* | 1854 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| 1851 | * cast `allowed' as a signed long because vm_committed_space | ||
| 1852 | * sometimes has a negative value | ||
| 1853 | */ | ||
| 1854 | if (atomic_long_read(&vm_committed_space) < (long)allowed) | ||
| 1855 | return 0; | 1855 | return 0; |
| 1856 | |||
| 1856 | error: | 1857 | error: |
| 1857 | vm_unacct_memory(pages); | 1858 | vm_unacct_memory(pages); |
| 1858 | 1859 | ||
| @@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | |||
| 491 | 491 | ||
| 492 | EXPORT_SYMBOL(pagevec_lookup_tag); | 492 | EXPORT_SYMBOL(pagevec_lookup_tag); |
| 493 | 493 | ||
| 494 | #ifdef CONFIG_SMP | ||
| 495 | /* | ||
| 496 | * We tolerate a little inaccuracy to avoid ping-ponging the counter between | ||
| 497 | * CPUs | ||
| 498 | */ | ||
| 499 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) | ||
| 500 | |||
| 501 | static DEFINE_PER_CPU(long, committed_space); | ||
| 502 | |||
| 503 | void vm_acct_memory(long pages) | ||
| 504 | { | ||
| 505 | long *local; | ||
| 506 | |||
| 507 | preempt_disable(); | ||
| 508 | local = &__get_cpu_var(committed_space); | ||
| 509 | *local += pages; | ||
| 510 | if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { | ||
| 511 | atomic_long_add(*local, &vm_committed_space); | ||
| 512 | *local = 0; | ||
| 513 | } | ||
| 514 | preempt_enable(); | ||
| 515 | } | ||
| 516 | |||
| 517 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 518 | |||
| 519 | /* Drop the CPU's cached committed space back into the central pool. */ | ||
| 520 | static int cpu_swap_callback(struct notifier_block *nfb, | ||
| 521 | unsigned long action, | ||
| 522 | void *hcpu) | ||
| 523 | { | ||
| 524 | long *committed; | ||
| 525 | |||
| 526 | committed = &per_cpu(committed_space, (long)hcpu); | ||
| 527 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | ||
| 528 | atomic_long_add(*committed, &vm_committed_space); | ||
| 529 | *committed = 0; | ||
| 530 | drain_cpu_pagevecs((long)hcpu); | ||
| 531 | } | ||
| 532 | return NOTIFY_OK; | ||
| 533 | } | ||
| 534 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 535 | #endif /* CONFIG_SMP */ | ||
| 536 | |||
| 537 | /* | 494 | /* |
| 538 | * Perform any setup for the swap system | 495 | * Perform any setup for the swap system |
| 539 | */ | 496 | */ |
| @@ -554,7 +511,4 @@ void __init swap_setup(void) | |||
| 554 | * Right now other parts of the system means that we | 511 | * Right now other parts of the system means that we |
| 555 | * _really_ don't want to cluster much more | 512 | * _really_ don't want to cluster much more |
| 556 | */ | 513 | */ |
| 557 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 558 | hotcpu_notifier(cpu_swap_callback, 0); | ||
| 559 | #endif | ||
| 560 | } | 514 | } |
