From f64ae042d94d376b54e7a343d93c48561e9d2e16 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 11 Nov 2011 08:33:48 +0800 Subject: slub: use correct parameter to add a page to partial list tail unfreeze_partials() needs add the page to partial list tail, since such page hasn't too many free objects. We now explictly use DEACTIVATE_TO_TAIL for this, while DEACTIVATE_TO_TAIL != 1. This will cause performance regression (eg, more lock contention in node->list_lock) without below fix. Signed-off-by: Shaohua Li Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7d2a996c307e..60e16c43f88c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1904,7 +1904,8 @@ static void unfreeze_partials(struct kmem_cache *s) if (l == M_PARTIAL) remove_partial(n, page); else - add_partial(n, page, 1); + add_partial(n, page, + DEACTIVATE_TO_TAIL); l = m; } -- cgit v1.2.2 From 9ada19342b2441f290f0043ed7c562682c8c4ede Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Nov 2011 13:34:13 +0800 Subject: slub: move discard_slab out of node lock Lockdep reports there is potential deadlock for slub node list_lock. discard_slab() is called with the lock hold in unfreeze_partials(), which could trigger a slab allocation, which could hold the lock again. discard_slab() doesn't need hold the lock actually, if the slab is already removed from partial list. Acked-by: Christoph Lameter Reported-and-tested-by: Yong Zhang Reported-and-tested-by: Julie Sullivan Signed-off-by: Shaohua Li Signed-off-by: Pekka Enberg --- mm/slub.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 60e16c43f88c..00efbb56a268 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1862,7 +1862,7 @@ static void unfreeze_partials(struct kmem_cache *s) { struct kmem_cache_node *n = NULL; struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - struct page *page; + struct page *page, *discard_page = NULL; while ((page = c->partial)) { enum slab_modes { M_PARTIAL, M_FREE }; @@ -1916,14 +1916,22 @@ static void unfreeze_partials(struct kmem_cache *s) "unfreezing slab")); if (m == M_FREE) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); - stat(s, FREE_SLAB); + page->next = discard_page; + discard_page = page; } } if (n) spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } } /* -- cgit v1.2.2 From 90459ce06f410b983540be56209c0abcbce23944 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 4 Aug 2011 11:02:33 +0200 Subject: percpu: rename pcpu_mem_alloc to pcpu_mem_zalloc Currently pcpu_mem_alloc() is implemented always return zeroed memory. So rename it to make user like pcpu_get_pages_and_bitmap() know don't reinit it. Signed-off-by: Bob Liu Reviewed-by: Pekka Enberg Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/percpu-vm.c | 5 ++--- mm/percpu.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea534960a04b..29e3730d2ffd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, if (!pages || !bitmap) { if (may_alloc && !pages) - pages = pcpu_mem_alloc(pages_size); + pages = pcpu_mem_zalloc(pages_size); if (may_alloc && !bitmap) - bitmap = pcpu_mem_alloc(bitmap_size); + bitmap = pcpu_mem_zalloc(bitmap_size); if (!pages || !bitmap) return NULL; } - memset(pages, 0, pages_size); bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); *bitmapp = bitmap; diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55dbed7..28c37a2e2de2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) /** - * pcpu_mem_alloc - allocate memory + * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vmalloc() is used. The returned + * kzalloc() is used; otherwise, vzalloc() is used. The returned * memory is always zeroed. * * CONTEXT: @@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_alloc(size_t size) +static void *pcpu_mem_zalloc(size_t size) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; @@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size) * @ptr: memory to free * @size: size of the area * - * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr, size_t size) { @@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; - new = pcpu_mem_alloc(new_size); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; - chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * + sizeof(chunk->map[0])); if (!chunk->map) { kfree(chunk); return NULL; @@ -1889,7 +1890,7 @@ void __init percpu_init_late(void) BUILD_BUG_ON(size > PAGE_SIZE); - map = pcpu_mem_alloc(size); + map = pcpu_mem_zalloc(size); BUG_ON(!map); spin_lock_irqsave(&pcpu_lock, flags); -- cgit v1.2.2 From a855b84c3d8c73220d4d3cd392a7bee7c83de70e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Nov 2011 10:55:35 -0800 Subject: percpu: fix chunk range calculation Percpu allocator recorded the cpus which map to the first and last units in pcpu_first/last_unit_cpu respectively and used them to determine the address range of a chunk - e.g. it assumed that the first unit has the lowest address in a chunk while the last unit has the highest address. This simply isn't true. Groups in a chunk can have arbitrary positive or negative offsets from the previous one and there is no guarantee that the first unit occupies the lowest offset while the last one the highest. Fix it by actually comparing unit offsets to determine cpus occupying the lowest and highest offsets. Also, rename pcu_first/last_unit_cpu to pcpu_low/high_unit_cpu to avoid confusion. The chunk address range is used to flush cache on vmalloc area map/unmap and decide whether a given address is in the first chunk by per_cpu_ptr_to_phys() and the bug was discovered by invalid per_cpu_ptr_to_phys() translation for crash_note. Kudos to Dave Young for tracking down the problem. Signed-off-by: Tejun Heo Reported-by: WANG Cong Reported-by: Dave Young Tested-by: Dave Young LKML-Reference: <4EC21F67.10905@redhat.com> Cc: stable @kernel.org --- mm/percpu-vm.c | 12 ++++++------ mm/percpu.c | 34 ++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 29e3730d2ffd..12a48a88c0d8 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -142,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -205,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -283,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** diff --git a/mm/percpu.c b/mm/percpu.c index 28c37a2e2de2..2473ff06dc76 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; -/* cpus with the lowest and highest unit numbers */ -static unsigned int pcpu_first_unit_cpu __read_mostly; -static unsigned int pcpu_last_unit_cpu __read_mostly; +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __read_mostly; +static unsigned int pcpu_high_unit_cpu __read_mostly; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; @@ -985,19 +985,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; - unsigned long first_start, first_end; + unsigned long first_low, first_high; unsigned int cpu; /* - * The following test on first_start/end isn't strictly + * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. */ - first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); - first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, - pcpu_unit_pages); - if ((unsigned long)addr >= first_start && - (unsigned long)addr < first_end) { + first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); + first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); @@ -1234,7 +1234,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; - pcpu_first_unit_cpu = NR_CPUS; + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; @@ -1254,9 +1256,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; - if (pcpu_first_unit_cpu == NR_CPUS) - pcpu_first_unit_cpu = cpu; - pcpu_last_unit_cpu = cpu; + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; -- cgit v1.2.2 From 67589c71456b0346500629967292dea3802230b6 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 23 Nov 2011 08:20:53 -0800 Subject: percpu: explain why per_cpu_ptr_to_phys() is more complicated than necessary Add comments about current per_cpu_ptr_to_phys implementation to explain why the logic is more complicated than necessary. -tj: relocated comment into kerneldoc comment Signed-off-by: Dave Young Signed-off-by: Tejun Heo --- mm/percpu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 2473ff06dc76..3bb810a72006 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -978,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr) * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * + * percpu allocator has special setup for the first chunk, which currently + * supports either embedding in linear address space or vmalloc mapping, + * and, from the second one, the backing allocator (currently either vm or + * km) provides translation. + * + * The addr can be tranlated simply without checking if it falls into the + * first chunk. But the current code reflects better how percpu allocator + * actually works, and the verification can discover both bugs in percpu + * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current + * code. + * * RETURNS: * The physical address for @addr. */ -- cgit v1.2.2 From 42d623a8cd08eb93ab221d22cee5a62618895bbf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Nov 2011 09:14:38 -0600 Subject: slub: use irqsafe_cpu_cmpxchg for put_cpu_partial The cmpxchg must be irq safe. The fallback for this_cpu_cmpxchg only disables preemption which results in per cpu partial page operation potentially failing on non x86 platforms. This patch fixes the following problem reported by Christian Kujau: I seem to hit it with heavy disk & cpu IO is in progress on this PowerBook G4. Full dmesg & .config: http://nerdbynature.de/bits/3.2.0-rc1/oops/ I've enabled some debug options and now it really points to slub.c:2166 http://nerdbynature.de/bits/3.2.0-rc1/oops/oops4m.jpg With debug options enabled I'm currently in the xmon debugger, not sure what to make of it yet, I'll try to get something useful out of it :) Reported-by: Christian Kujau Tested-by: Christian Kujau Acked-by: Eric Dumazet Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 00efbb56a268..2a9cfd72a3d7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1978,7 +1978,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->pobjects = pobjects; page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); stat(s, CPU_PARTIAL_FREE); return pobjects; } -- cgit v1.2.2 From bc6697d8a506dedf09e8e9974ffa3a316183e608 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Nov 2011 16:02:02 +0100 Subject: slub: avoid potential NULL dereference or corruption show_slab_objects() can trigger NULL dereferences or memory corruption. Another cpu can change its c->page to NULL or c->node to NUMA_NO_NODE while we use them. Use ACCESS_ONCE(c->page) and ACCESS_ONCE(c->node) to make sure this cannot happen. Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Eric Dumazet Signed-off-by: Pekka Enberg --- mm/slub.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2a9cfd72a3d7..ed3334d9b6da 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4444,30 +4444,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + int node = ACCESS_ONCE(c->node); struct page *page; - if (!c || c->node < 0) + if (node < 0) continue; - - if (c->page) { - if (flags & SO_TOTAL) - x = c->page->objects; + page = ACCESS_ONCE(c->page); + if (page) { + if (flags & SO_TOTAL) + x = page->objects; else if (flags & SO_OBJECTS) - x = c->page->inuse; + x = page->inuse; else x = 1; total += x; - nodes[c->node] += x; + nodes[node] += x; } page = c->partial; if (page) { x = page->pobjects; - total += x; - nodes[c->node] += x; + total += x; + nodes[node] += x; } - per_cpu[c->node]++; + per_cpu[node]++; } } -- cgit v1.2.2 From a50527b19c62c808a7fca022816fff88a50b948d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 2 Dec 2011 09:17:02 +0800 Subject: fs: Make write(2) interruptible by a fatal signal Currently write(2) to a file is not interruptible by any signal. Sometimes this is desirable, e.g. when you want to quickly kill a process hogging your disk. Also, with commit 499d05ecf990 ("mm: Make task in balance_dirty_pages() killable"), it's necessary to abort the current write accordingly to avoid it quickly dirtying lots more pages at unthrottled rate. This patch makes write interruptible by SIGKILL. We do not allow write to be interruptible by any other signal because that has larger potential of screwing some badly written applications. Reported-by: Kazuya Mio Tested-by: Kazuya Mio Acked-by: Matthew Wilcox Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/filemap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c0018f2d50e0..c106d3b3cc64 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2407,7 +2407,6 @@ static ssize_t generic_perform_write(struct file *file, iov_iter_count(i)); again: - /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the @@ -2463,7 +2462,10 @@ again: written += copied; balance_dirty_pages_ratelimited(mapping); - + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } } while (iov_iter_count(i)); return written ? written : status; -- cgit v1.2.2 From 52cef189165d74a5d6030184a8e05595194c69ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2011 21:12:40 +0100 Subject: slab, lockdep: Fix silly bug Commit 30765b92 ("slab, lockdep: Annotate the locks before using them") moves the init_lock_keys() call from after g_cpucache_up = FULL, to before it. And overlooks the fact that init_node_lock_keys() tests for it and ignores everything !FULL. Introduce a LATE stage and change the lockdep test to be Cc: Pekka Enberg Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/slab.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 708efe886154..83311c9aaf9d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -595,6 +595,7 @@ static enum { PARTIAL_AC, PARTIAL_L3, EARLY, + LATE, FULL } g_cpucache_up; @@ -671,7 +672,7 @@ static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; - if (g_cpucache_up != FULL) + if (g_cpucache_up < LATE) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { @@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + g_cpucache_up = LATE; + /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); -- cgit v1.2.2 From aed21ad28b1323b2807faea019e5ac388a7bc837 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Nov 2011 11:44:41 -0600 Subject: writeback: comment on the bdi dirty threshold We do "floating proportions" to let active devices to grow its target share of dirty pages and stalled/inactive devices to decrease its target share over time. It works well except in the case of "an inactive disk suddenly goes busy", where the initial target share may be too small. To mitigate this, bdi_position_ratio() has the below line to raise a small bdi_thresh when it's safe to do so, so that the disk be feed with enough dirty pages for efficient IO and in turn fast rampup of bdi_thresh: bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); balance_dirty_pages() normally does negative feedback control which adjusts ratelimit to balance the bdi dirty pages around the target. In some extreme cases when that is not enough, it will have to block the tasks completely until the bdi dirty pages drop below bdi_thresh. Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 71252486bc6f..155efca4c123 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -411,8 +411,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * * Returns @bdi's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * And the "limit" in the name is not seriously taken as hard limit in - * balance_dirty_pages(). + * + * Note that balance_dirty_pages() will only seriously take it as a hard limit + * when sleeping max_pause per page is not enough to keep the dirty pages under + * control. For example, when the device is completely stalled due to some error + * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * In the other normal situations, it acts more gently by throttling the tasks + * more (rather than completely block them) when the bdi dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices @@ -594,6 +599,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, */ if (unlikely(bdi_thresh > thresh)) bdi_thresh = thresh; + /* + * It's very possible that bdi_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); /* * scale global setpoint to bdi's: -- cgit v1.2.2 From c5c6343c4d75f9d3226e05a72e7861e967fc8099 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 2 Dec 2011 10:21:33 -0600 Subject: writeback: permit through good bdi even when global dirty exceeded On a system with 1 local mount and 1 NFS mount, if the NFS server becomes not responding when dd to the NFS mount, the NFS dirty pages may exceed the global dirty limit and _every_ task involving writing will be blocked. The whole system appears unresponsive. The workaround is to permit through the bdi's that only has a small number of dirty pages. The number chosen (bdi_stat_error pages) is not enough to enable the local disk to run in optimal throughput, however is enough to make the system responsive on a broken NFS mount. The user can then kill the dirtiers on the NFS mount and increase the global dirty limit to bring up the local disk's throughput. It risks allowing dirty pages to grow much larger than the global dirty limit when there are 1000+ mounts, however that's very unlikely to happen, especially in low memory profiles. Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 155efca4c123..17403e3a7c89 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1148,6 +1148,19 @@ pause: if (task_ratelimit) break; + /* + * In the case of an unresponding NFS server and the NFS dirty + * pages exceeds dirty_thresh, give the other good bdi's a pipe + * to go through, so that tasks on them still remain responsive. + * + * In theory 1 page is enough to keep the comsumer-producer + * pipe going: the flusher cleans 1 page => the task dirties 1 + * more page. However bdi_dirty has accounting errors. So use + * the larger and more IO friendly bdi_stat_error. + */ + if (bdi_dirty <= bdi_stat_error(bdi)) + break; + if (fatal_signal_pending(current)) break; } -- cgit v1.2.2 From 82e230a07de3812a5e87a27979f033dad59172e3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 2 Dec 2011 18:21:51 -0600 Subject: writeback: set max_pause to lowest value on zero bdi_dirty Some trace shows lots of bdi_dirty=0 lines where it's actually some small value if w/o the accounting errors in the per-cpu bdi stats. In this case the max pause time should really be set to the smallest (non-zero) value to avoid IO queue underrun and improve throughput. Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 17403e3a7c89..50f08241f981 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -989,8 +989,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - if (bdi_dirty) - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); + t = min(t, bdi_dirty * HZ / (8 * bw + 1)); /* * The pause time will be settled within range (max_pause/4, max_pause). -- cgit v1.2.2 From 635697c663f38106063d5659f0cf2e45afcd4bb5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:51 -0800 Subject: vmscan: fix initial shrinker size handling A shrinker function can return -1, means that it cannot do anything without a risk of deadlock. For example prune_super() does this if it cannot grab a superblock refrence, even if nr_to_scan=0. Currently we interpret this -1 as a ULONG_MAX size shrinker and evaluate `total_scan' according to this. So the next time around this shrinker can cause really big pressure. Let's skip such shrinkers instead. Also make total_scan signed, otherwise the check (total_scan < 0) below never works. Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a1893c050795..f5255442ae2b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -247,14 +247,18 @@ unsigned long shrink_slab(struct shrink_control *shrink, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; + long total_scan; + long max_pass; int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + if (max_pass <= 0) + continue; + /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations @@ -265,7 +269,6 @@ unsigned long shrink_slab(struct shrink_control *shrink, } while (cmpxchg(&shrinker->nr, nr, 0) != nr); total_scan = nr; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); -- cgit v1.2.2 From 83aeeada7c69f35e5100b27ec354335597a7a488 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:54 -0800 Subject: vmscan: use atomic-long for shrinker batching Use atomic-long operations instead of looping around cmpxchg(). [akpm@linux-foundation.org: massage atomic.h inclusions] Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f5255442ae2b..f54a05b7a61d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -264,9 +264,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -328,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } -- cgit v1.2.2 From 1dfb059b9438633b0546c5431538a47f6ed99028 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 8 Dec 2011 14:33:57 -0800 Subject: thp: reduce khugepaged freezing latency khugepaged can sometimes cause suspend to fail, requiring that the user retry the suspend operation. Use wait_event_freezable_timeout() instead of schedule_timeout_interruptible() to avoid missing freezer wakeups. A try_to_freeze() would have been needed in the khugepaged_alloc_hugepage tight loop too in case of the allocation failing repeatedly, and wait_event_freezable_timeout will provide it too. khugepaged would still freeze just fine by trying again the next minute but it's better if it freezes immediately. Reported-by: Jiri Slaby Signed-off-by: Andrea Arcangeli Tested-by: Jiri Slaby Cc: Tejun Heo Cc: Oleg Nesterov Cc: "Srivatsa S. Bhat" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4298abaae153..36b3d988b4ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,12 +2259,8 @@ static void khugepaged_do_scan(struct page **hpage) static void khugepaged_alloc_sleep(void) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } #ifndef CONFIG_NUMA @@ -2313,14 +2309,10 @@ static void khugepaged_loop(void) if (unlikely(kthread_should_stop())) break; if (khugepaged_has_work()) { - DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) continue; - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_scan_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); } else if (khugepaged_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); -- cgit v1.2.2 From 58a84aa92723d1ac3e1cc4e3b0ff49291663f7e1 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 8 Dec 2011 14:34:18 -0800 Subject: thp: set compound tail page _count to zero Commit 70b50f94f1644 ("mm: thp: tail page refcounting fix") keeps all page_tail->_count zero at all times. But the current kernel does not set page_tail->_count to zero if a 1GB page is utilized. So when an IOMMU 1GB page is used by KVM, it wil result in a kernel oops because a tail page's _count does not equal zero. kernel BUG at include/linux/mm.h:386! invalid opcode: 0000 [#1] SMP Call Trace: gup_pud_range+0xb8/0x19d get_user_pages_fast+0xcb/0x192 ? trace_hardirqs_off+0xd/0xf hva_to_pfn+0x119/0x2f2 gfn_to_pfn_memslot+0x2c/0x2e kvm_iommu_map_pages+0xfd/0x1c1 kvm_iommu_map_memslots+0x7c/0xbd kvm_iommu_map_guest+0xaa/0xbf kvm_vm_ioctl_assigned_device+0x2ef/0xa47 kvm_vm_ioctl+0x36c/0x3a2 do_vfs_ioctl+0x49e/0x4e4 sys_ioctl+0x5a/0x7c system_call_fastpath+0x16/0x1b RIP gup_huge_pud+0xf2/0x159 Signed-off-by: Youquan Song Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb28a5f9db8d..73f17c0293c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dd443d89d8b..850009a7101e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -356,8 +356,8 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } -- cgit v1.2.2 From 09761333ed47e899cc1482c13090b95f3f711971 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 8 Dec 2011 14:34:20 -0800 Subject: mm/migrate.c: pair unlock_page() and lock_page() when migrating huge pages Avoid unlocking and unlocked page if we failed to lock it. Signed-off-by: Hillf Danton Cc: Naoya Horiguchi Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 578e29174fa6..177aca424a06 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -871,9 +871,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); -out: unlock_page(hpage); +out: if (rc != -EAGAIN) { list_del(&hpage->lru); put_page(hpage); -- cgit v1.2.2 From d021563888312018ca65681096f62e36c20e63cc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 8 Dec 2011 14:34:27 -0800 Subject: mm: Ensure that pfn_valid() is called once per pageblock when reserving pageblocks setup_zone_migrate_reserve() expects that zone->start_pfn starts at pageblock_nr_pages aligned pfn otherwise we could access beyond an existing memblock resulting in the following panic if CONFIG_HOLES_IN_ZONE is not configured and we do not check pfn_valid: IP: [] setup_zone_migrate_reserve+0xcd/0x180 *pdpt = 0000000000000000 *pde = f000ff53f000ff53 Oops: 0000 [#1] SMP Pid: 1, comm: swapper Not tainted 3.0.7-0.7-pae #1 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform EIP: 0060:[] EFLAGS: 00010006 CPU: 0 EIP is at setup_zone_migrate_reserve+0xcd/0x180 EAX: 000c0000 EBX: f5801fc0 ECX: 000c0000 EDX: 00000000 ESI: 000c01fe EDI: 000c01fe EBP: 00140000 ESP: f2475f58 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f2474000 task=f2472cd0 task.ti=f2474000) Call Trace: [] __setup_per_zone_wmarks+0xec/0x160 [] setup_per_zone_wmarks+0xf/0x20 [] init_per_zone_wmark_min+0x27/0x86 [] do_one_initcall+0x2b/0x160 [] kernel_init+0xbe/0x157 [] kernel_thread_helper+0x6/0xd Code: a5 39 f5 89 f7 0f 46 fd 39 cf 76 40 8b 03 f6 c4 08 74 32 eb 91 90 89 c8 c1 e8 0e 0f be 80 80 2f 86 c0 8b 14 85 60 2f 86 c0 89 c8 <2b> 82 b4 12 00 00 c1 e0 05 03 82 ac 12 00 00 8b 00 f6 c4 08 0f EIP: [] setup_zone_migrate_reserve+0xcd/0x180 SS:ESP 0068:f2475f58 CR2: 00000000000012b4 We crashed in pageblock_is_reserved() when accessing pfn 0xc0000 because highstart_pfn = 0x36ffe. The issue was introduced in 3.0-rc1 by 6d3163ce ("mm: check if any page in a pageblock is reserved before marking it MIGRATE_RESERVE"). Make sure that start_pfn is always aligned to pageblock_nr_pages to ensure that pfn_valid s always called at the start of each pageblock. Architectures with holes in pageblocks will be correctly handled by pfn_valid_within in pageblock_is_reserved. Signed-off-by: Michal Hocko Signed-off-by: Mel Gorman Tested-by: Dang Bo Reviewed-by: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Cc: Arve Hjnnevg Cc: KOSAKI Motohiro Cc: John Stultz Cc: Dave Hansen Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 850009a7101e..2b8ba3aebf6e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3377,9 +3377,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) unsigned long block_migratetype; int reserve; - /* Get the start pfn, end pfn and the number of blocks to reserve */ + /* + * Get the start pfn, end pfn and the number of blocks to reserve + * We have to be careful to be aligned to pageblock_nr_pages to + * make sure that we always check pfn_valid for the first page in + * the block. + */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; + start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; -- cgit v1.2.2 From 1368edf0647ac112d8cfa6ce47257dc950c50f5c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 8 Dec 2011 14:34:30 -0800 Subject: mm: vmalloc: check for page allocation failure before vmlist insertion Commit f5252e00 ("mm: avoid null pointer access in vm_struct via /proc/vmallocinfo") adds newly allocated vm_structs to the vmlist after it is fully initialised. Unfortunately, it did not check that __vmalloc_area_node() successfully populated the area. In the event of allocation failure, the vmalloc area is freed but the pointer to freed memory is inserted into the vmlist leading to a a crash later in get_vmalloc_info(). This patch adds a check for ____vmalloc_area_node() failure within __vmalloc_node_range. It does not use "goto fail" as in the previous error path as a warning was already displayed by __vmalloc_area_node() before it called vfree in its failure path. Credit goes to Luciano Chavez for doing all the real work of identifying exactly where the problem was. Signed-off-by: Mel Gorman Reported-by: Luciano Chavez Tested-by: Luciano Chavez Reviewed-by: Rik van Riel Acked-by: David Rientjes Cc: [3.1.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3231bf332878..1d8b32f07139 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1633,6 +1633,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + if (!addr) + return NULL; /* * In this function, newly allocated vm_struct is not added -- cgit v1.2.2 From 9f57bd4d6dc69a4e3bf43044fa00fcd24dd363e3 Mon Sep 17 00:00:00 2001 From: Eugene Surovegin Date: Thu, 15 Dec 2011 11:25:59 -0800 Subject: percpu: fix per_cpu_ptr_to_phys() handling of non-page-aligned addresses per_cpu_ptr_to_phys() incorrectly rounds up its result for non-kmalloc case to the page boundary, which is bogus for any non-page-aligned address. This affects the only in-tree user of this function - sysfs handler for per-cpu 'crash_notes' physical address. The trouble is that the crash_notes per-cpu variable is not page-aligned: crash_notes = 0xc08e8ed4 PER-CPU OFFSET VALUES: CPU 0: 3711f000 CPU 1: 37129000 CPU 2: 37133000 CPU 3: 3713d000 So, the per-cpu addresses are: crash_notes on CPU 0: f7a07ed4 => phys 36b57ed4 crash_notes on CPU 1: f7a11ed4 => phys 36b4ded4 crash_notes on CPU 2: f7a1bed4 => phys 36b43ed4 crash_notes on CPU 3: f7a25ed4 => phys 36b39ed4 However, /sys/devices/system/cpu/cpu*/crash_notes says: /sys/devices/system/cpu/cpu0/crash_notes: 36b57000 /sys/devices/system/cpu/cpu1/crash_notes: 36b4d000 /sys/devices/system/cpu/cpu2/crash_notes: 36b43000 /sys/devices/system/cpu/cpu3/crash_notes: 36b39000 As you can see, all values are rounded down to a page boundary. Consequently, this is where kexec sets up the NOTE segments, and thus where the secondary kernel is looking for them. However, when the first kernel crashes, it saves the notes to the unaligned addresses, where they are not found. Fix it by adding offset_in_page() to the translated page address. -tj: Combined Eugene's and Petr's commit messages. Signed-off-by: Eugene Surovegin Signed-off-by: Tejun Heo Reported-by: Petr Tesarik Cc: stable@kernel.org --- mm/percpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 3bb810a72006..716eb4acf2fc 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1023,9 +1023,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) if (!is_vmalloc_addr(addr)) return __pa(addr); else - return page_to_phys(vmalloc_to_page(addr)); + return page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); } else - return page_to_phys(pcpu_addr_to_page(addr)); + return page_to_phys(pcpu_addr_to_page(addr)) + + offset_in_page(addr); } /** -- cgit v1.2.2 From a41c58a6665cc995e237303b05db42100b71b65e Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 19 Dec 2011 17:11:57 -0800 Subject: memcg: keep root group unchanged if creation fails If the request is to create non-root group and we fail to meet it, we should leave the root unchanged. Signed-off-by: Hillf Danton Acked-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c98aca..b63f5f7dfa07 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4907,9 +4907,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = memcg; if (mem_cgroup_soft_limit_tree_init()) goto free_out; + root_mem_cgroup = memcg; for_each_possible_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); @@ -4948,7 +4948,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &memcg->css; free_out: __mem_cgroup_free(memcg); - root_mem_cgroup = NULL; return ERR_PTR(error); } -- cgit v1.2.2 From ff05b6f7ae762b6eb464183eec994b28ea09f6dd Mon Sep 17 00:00:00 2001 From: Frantisek Hrbata Date: Mon, 19 Dec 2011 17:11:59 -0800 Subject: oom: fix integer overflow of points in oom_badness An integer overflow will happen on 64bit archs if task's sum of rss, swapents and nr_ptes exceeds (2^31)/1000 value. This was introduced by commit f755a04 oom: use pte pages in OOM score where the oom score computation was divided into several steps and it's no longer computed as one expression in unsigned long(rss, swapents, nr_pte are unsigned long), where the result value assigned to points(int) is in range(1..1000). So there could be an int overflow while computing 176 points *= 1000; and points may have negative value. Meaning the oom score for a mem hog task will be one. 196 if (points <= 0) 197 return 1; For example: [ 3366] 0 3366 35390480 24303939 5 0 0 oom01 Out of memory: Kill process 3366 (oom01) score 1 or sacrifice child Here the oom1 process consumes more than 24303939(rss)*4096~=92GB physical memory, but it's oom score is one. In this situation the mem hog task is skipped and oom killer kills another and most probably innocent task with oom score greater than one. The points variable should be of type long instead of int to prevent the int overflow. Signed-off-by: Frantisek Hrbata Acked-by: KOSAKI Motohiro Acked-by: Oleg Nesterov Acked-by: David Rientjes Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 76f2c5ae908e..069b64e521fc 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,7 +176,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { - int points; + long points; if (oom_unkillable_task(p, mem, nodemask)) return 0; -- cgit v1.2.2 From 0006526d78e93c3684c806bf7cf3f67dfa49c3c8 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Mon, 19 Dec 2011 17:12:04 -0800 Subject: mm/vmalloc.c: remove static declaration of va from __get_vm_area_node Static storage is not required for the struct vmap_area in __get_vm_area_node. Removing "static" to store this variable on the stack instead. Signed-off-by: Kautuk Consul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d8b32f07139..27be2f0d4cb7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1290,7 +1290,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { - static struct vmap_area *va; + struct vmap_area *va; struct vm_struct *area; BUG_ON(in_interrupt()); -- cgit v1.2.2