From baaf1dd491433a78826150aff7411015de7e9b65 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 5 Aug 2012 15:00:58 +0000 Subject: mm/slob: use min_t() to compare ARCH_SLAB_MINALIGN The definition of ARCH_SLAB_MINALIGN is architecture dependent and can be either of type size_t or int. Comparing that value with ARCH_KMALLOC_MINALIGN can cause harmless warnings on platforms where they are different. Since both are always small positive integer numbers, using the size_t type to compare them is safe and gets rid of the warning. Without this patch, building ARM collie_defconfig results in: mm/slob.c: In function '__kmalloc_node': mm/slob.c:431:152: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'kfree': mm/slob.c:484:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'ksize': mm/slob.c:503:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] Signed-off-by: Arnd Bergmann Acked-by: Christoph Lameter Cc: Pekka Enberg --- mm/slob.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 45d4ca79933a..497c55e318a1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -428,7 +428,7 @@ out: void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -481,7 +481,7 @@ void kfree(const void *block) sp = virt_to_page(block); if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else @@ -500,7 +500,7 @@ size_t ksize(const void *block) sp = virt_to_page(block); if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } else -- cgit v1.2.2 From 325adeb55e32c50055c654e5b06a49f0bd88420b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 15 Oct 2012 13:44:56 +0200 Subject: mm: huge_memory: Fix build error. Certain configurations won't implicitly pull in resulting in the following build error: mm/huge_memory.c: In function 'release_pte_page': mm/huge_memory.c:1697:2: error: implicit declaration of function 'unlock_page' [-Werror=implicit-function-declaration] mm/huge_memory.c: In function '__collapse_huge_page_isolate': mm/huge_memory.c:1757:3: error: implicit declaration of function 'trylock_page' [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Reported-by: David Daney Signed-off-by: Ralf Baechle Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a863af26c79c..40f17c34b415 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "internal.h" -- cgit v1.2.2 From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Oct 2012 17:31:23 -0700 Subject: mm, mempolicy: fix printing stack contents in numa_maps When reading /proc/pid/numa_maps, it's possible to return the contents of the stack where the mempolicy string should be printed if the policy gets freed from beneath us. This happens because mpol_to_str() may return an error the stack-allocated buffer is then printed without ever being stored. There are two possible error conditions in mpol_to_str(): - if the buffer allocated is insufficient for the string to be stored, and - if the mempolicy has an invalid mode. The first error condition is not triggered in any of the callers to mpol_to_str(): at least 50 bytes is always allocated on the stack and this is sufficient for the string to be written. A future patch should convert this into BUILD_BUG_ON() since we know the maximum strlen possible, but that's not -rc material. The second error condition is possible if a race occurs in dropping a reference to a task's mempolicy causing it to be freed during the read(). The slab poison value is then used for the mode and mpol_to_str() returns -EINVAL. This race is only possible because get_vma_policy() believes that mm->mmap_sem protects task->mempolicy, which isn't true. The exit path does not hold mm->mmap_sem when dropping the reference or setting task->mempolicy to NULL: it uses task_lock(task) instead. Thus, it's required for the caller of a task mempolicy to hold task_lock(task) while grabbing the mempolicy and reading it. Callers with a vma policy store their mempolicy earlier and can simply increment the reference count so it's guaranteed not to be freed. Reported-by: Dave Jones Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b78fb9ea65b..d04a8a54c294 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1536,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies - * are protected by the task's mmap_sem, which must be held for read by - * the caller. + * Current or other task's task mempolicy and non-shared vma policies must be + * protected by task_lock(task) by the caller. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the -- cgit v1.2.2 From deb521c44fa529b24cc78a64702757a683f82487 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Oct 2012 13:37:57 -0700 Subject: remap_file_pages: correctly handle the case of a NULL vm_ops pointer In commit 0b173bc4daa8 ("mm: kill vma flag VM_CAN_NONLINEAR") we replaced the VM_CAN_NONLINEAR test with checking whether the mapping has a '->remap_pages()' vm operation, but there is no guarantee that there it even has a vm_ops pointer at all. Add the appropriate test for NULL vm_ops. Reported-by: Sasha Levin Cc: Konstantin Khlebnikov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 3899a86851ce..a0aaf0e56800 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!vma->vm_ops->remap_pages) + if (!vma->vm_ops || !vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) -- cgit v1.2.2 From 0db63d7e25f96e2c6da925c002badf6f144ddf30 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 13:56:57 -0700 Subject: mm: compaction: correct the nr_strict va isolated check for CMA Thierry reported that the "iron out" patch for isolate_freepages_block() had problems due to the strict check being too strict with "mm: compaction: Iron out isolate_freepages_block() and isolate_freepages_range() -fix1". It's possible that more pages than necessary are isolated but the check still fails and I missed that this fix was not picked up before RC1. This same problem has been identified in 3.7-RC1 by Tony Prisk and should be addressed by the following patch. Signed-off-by: Mel Gorman Tested-by: Tony Prisk Reported-by: Thierry Reding Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2c4ce17651d8..9eef55838fca 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -346,7 +346,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * pages requested were isolated. If there were any failures, 0 is * returned and CMA will fail. */ - if (strict && nr_strict_required != total_isolated) + if (strict && nr_strict_required > total_isolated) total_isolated = 0; if (locked) -- cgit v1.2.2 From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Trim memory in memblock to be page aligned We will not map partial pages, so need to make sure memblock allocation will not allocate those bytes out. Also we will use for_each_mem_pfn_range() to loop to map memory range to keep them consistent. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- mm/memblock.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 931eef145af5..625905523c2a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -930,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + int i; + phys_addr_t start, end, orig_start, orig_end; + struct memblock_type *mem = &memblock.memory; + + for (i = 0; i < mem->cnt; i++) { + orig_start = mem->regions[i].base; + orig_end = mem->regions[i].base + mem->regions[i].size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + mem->regions[i].base = start; + mem->regions[i].size = end - start; + } else { + memblock_remove_region(mem, i); + i--; + } + } +} void __init_memblock memblock_set_current_limit(phys_addr_t limit) { -- cgit v1.2.2 From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Oct 2012 13:37:31 -0700 Subject: mm: fix XFS oops due to dirty pages without buffers on s390 On s390 any write to a page (even from kernel itself) sets architecture specific page dirty bit. Thus when a page is written to via buffered write, HW dirty bit gets set and when we later map and unmap the page, page_remove_rmap() finds the dirty bit and calls set_page_dirty(). Dirtying of a page which shouldn't be dirty can cause all sorts of problems to filesystems. The bug we observed in practice is that buffers from the page get freed, so when the page gets later marked as dirty and writeback writes it, XFS crashes due to an assertion BUG_ON(!PagePrivate(page)) in page_buffers() called from xfs_count_page_state(). Similar problem can also happen when zero_user_segment() call from xfs_vm_writepage() (or block_write_full_page() for that matter) set the hardware dirty bit during writeback, later buffers get freed, and then page unmapped. Fix the issue by ignoring s390 HW dirty bit for page cache pages of mappings with mapping_cap_account_dirty(). This is safe because for such mappings when a page gets marked as writeable in PTE it is also marked dirty in do_wp_page() or do_page_fault(). When the dirty bit is cleared by clear_page_dirty_for_io(), the page gets writeprotected in page_mkclean(). So pagecache page is writeable if and only if it is dirty. Thanks to Hugh Dickins for pointing out mapping has to have mapping_cap_account_dirty() for things to work and proposing a cleaned up variant of the patch. The patch has survived about two hours of running fsx-linux on tmpfs while heavily swapping and several days of running on out build machines where the original problem was triggered. Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Cc: Heiko Carstens Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7df7984d476c..2ee1ef0f317b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -926,11 +927,8 @@ int page_mkclean(struct page *page) if (page_mapped(page)) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping) ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } } return ret; @@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page) * this if the page is anon, so about to be freed; but perhaps * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. + * + * And we can skip it on file pages, so long as the filesystem + * participates in dirty tracking; but need to catch shm and tmpfs + * and ramfs pages which have been modified since creation by read + * fault. + * + * Note that mapping must be decided above, before decrementing + * mapcount (which luckily provides a barrier): once page is unmapped, + * it could be truncated and page->mapping reset to NULL at any moment. + * Note also that we are relying on page_mapping(page) to set mapping + * to &swapper_space when PageSwapCache(page). */ - if ((!anon || PageSwapCache(page)) && + if (mapping && !mapping_cap_account_dirty(mapping) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* -- cgit v1.2.2 From 86a595f961360c9c31d00fb111dd09e5d5ba9a40 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 25 Oct 2012 13:37:56 -0700 Subject: mm/page_alloc.c:alloc_contig_range(): return early for err path If start_isolate_page_range() failed, unset_migratetype_isolate() has been done inside it. Signed-off-by: Bob Liu Cc: Ni zhan Chen Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb90971182bd..b0012ab372a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5825,7 +5825,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype); if (ret) - goto done; + return ret; ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) -- cgit v1.2.2 From 35cfa2b0b491c37e23527822bf365610dbb188e5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 25 Oct 2012 13:38:01 -0700 Subject: mm/mmu_notifier: allocate mmu_notifier in advance While allocating mmu_notifier with parameter GFP_KERNEL, swap would start to work in case of tight available memory. Eventually, that would lead to a deadlock while the swap deamon swaps anonymous pages. It was caused by commit e0f3c3f78da29b ("mm/mmu_notifier: init notifier if necessary"). ================================= [ INFO: inconsistent lock state ] 3.7.0-rc1+ #518 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/35 [HC0[0]:SC0[0]:HE1:SE1] takes: (&mapping->i_mmap_mutex){+.+.?.}, at: page_referenced+0x9c/0x2e0 {RECLAIM_FS-ON-W} state was registered at: mark_held_locks+0x86/0x150 lockdep_trace_alloc+0x67/0xc0 kmem_cache_alloc_trace+0x33/0x230 do_mmu_notifier_register+0x87/0x180 mmu_notifier_register+0x13/0x20 kvm_dev_ioctl+0x428/0x510 do_vfs_ioctl+0x98/0x570 sys_ioctl+0x91/0xb0 system_call_fastpath+0x16/0x1b irq event stamp: 825 hardirqs last enabled at (825): _raw_spin_unlock_irq+0x30/0x60 hardirqs last disabled at (824): _raw_spin_lock_irq+0x19/0x80 softirqs last enabled at (0): copy_process+0x630/0x17c0 softirqs last disabled at (0): (null) ... Simply back out the above commit, which was a small performance optimization. Signed-off-by: Gavin Shan Reported-by: Andrea Righi Tested-by: Andrea Righi Cc: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 479a1e751a73..8a5ac8c686b0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -196,28 +196,28 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); /* - * Verify that mmu_notifier_init() already run and the global srcu is - * initialized. - */ + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out; + goto out_clean; if (!mm_has_notifiers(mm)) { - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), - GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) { - ret = -ENOMEM; - goto out_of_mem; - } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -233,12 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); -out_of_mem: mm_drop_all_locks(mm); -out: +out_clean: if (take_mmap_sem) up_write(&mm->mmap_sem); - + kfree(mmu_notifier_mm); +out: BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } -- cgit v1.2.2 From 6b187d0260b6cd1d0904309f32659b7ed5948af8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 25 Oct 2012 13:38:08 -0700 Subject: mm, numa: avoid setting zone_reclaim_mode unless a node is sufficiently distant Commit 957f822a0ab9 ("mm, numa: reclaim from all nodes within reclaim distance") caused zone_reclaim_mode to be set for all systems where two nodes are within RECLAIM_DISTANCE of each other. This is the opposite of what we actually want: zone_reclaim_mode should be set if two nodes are sufficiently distant. Signed-off-by: David Rientjes Reported-by: Julian Wollrath Tested-by: Julian Wollrath Cc: Hugh Dickins Cc: Patrik Kullman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0012ab372a4..5b74de6702e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid) int i; for_each_online_node(i) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); + else zone_reclaim_mode = 1; - } } #else /* CONFIG_NUMA */ -- cgit v1.2.2 From b0a8cc58e6b9aaae3045752059e5e6260c0b94bc Mon Sep 17 00:00:00 2001 From: Takamori Yamaguchi Date: Thu, 8 Nov 2012 15:53:39 -0800 Subject: mm: bugfix: set current->reclaim_state to NULL while returning from kswapd() In kswapd(), set current->reclaim_state to NULL before returning, as current->reclaim_state holds reference to variable on kswapd()'s stack. In rare cases, while returning from kswapd() during memory offlining, __free_slab() and freepages() can access the dangling pointer of current->reclaim_state. Signed-off-by: Takamori Yamaguchi Signed-off-by: Aaditya Kumar Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2624edcfb420..8b055e9379bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3017,6 +3017,8 @@ static int kswapd(void *p) &balanced_classzone_idx); } } + + current->reclaim_state = NULL; return 0; } -- cgit v1.2.2 From 63c3b902e517012b127d6528434b928ceaa10f7b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 16 Nov 2012 14:14:47 -0800 Subject: mm: add anon_vma_lock to validate_mm() Iterating over the vma->anon_vma_chain without anon_vma_lock may cause NULL ptr deref in anon_vma_interval_tree_verify(), because the node in the chain might have been removed. BUG: unable to handle kernel paging request at fffffffffffffff0 IP: [] anon_vma_interval_tree_verify+0xc/0xa0 PGD 4e28067 PUD 4e29067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU 0 Pid: 9050, comm: trinity-child64 Tainted: G W 3.7.0-rc2-next-20121025-sasha-00001-g673f98e-dirty #77 RIP: 0010: anon_vma_interval_tree_verify+0xc/0xa0 Process trinity-child64 (pid: 9050, threadinfo ffff880045f80000, task ffff880048eb0000) Call Trace: validate_mm+0x58/0x1e0 vma_adjust+0x635/0x6b0 __split_vma.isra.22+0x161/0x220 split_vma+0x24/0x30 sys_madvise+0x5da/0x7b0 tracesys+0xe1/0xe6 RIP anon_vma_interval_tree_verify+0xc/0xa0 CR2: fffffffffffffff0 Figured out by Bob Liu. Reported-by: Sasha Levin Cc: Bob Liu Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2d942353d681..9a796c41e7d9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -334,8 +334,10 @@ void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { struct anon_vma_chain *avc; + vma_lock_anon_vma(vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); + vma_unlock_anon_vma(vma); vma = vma->vm_next; i++; } -- cgit v1.2.2 From 1756954c61cb5c97c618ccb366482b9c1f891d6d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 16 Nov 2012 14:14:48 -0800 Subject: mm: fix build warning for uninitialized value do_wp_page() sets mmun_called if mmun_start and mmun_end were initialized and, if so, may call mmu_notifier_invalidate_range_end() with these values. This doesn't prevent gcc from emitting a build warning though: mm/memory.c: In function `do_wp_page': mm/memory.c:2530: warning: `mmun_start' may be used uninitialized in this function mm/memory.c:2531: warning: `mmun_end' may be used uninitialized in this function It's much easier to initialize the variables to impossible values and do a simple comparison to determine if they were initialized to remove the bool entirely. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index fb135ba4aba9..221fc9ffcab1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2527,9 +2527,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - bool mmun_called = false; /* For mmu_notifiers */ + unsigned long mmun_start = 0; /* For mmu_notifiers */ + unsigned long mmun_end = 0; /* For mmu_notifiers */ old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2708,8 +2707,7 @@ gotten: goto oom_free_new; mmun_start = address & PAGE_MASK; - mmun_end = (address & PAGE_MASK) + PAGE_SIZE; - mmun_called = true; + mmun_end = mmun_start + PAGE_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2778,7 +2776,7 @@ gotten: page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); - if (mmun_called) + if (mmun_end > mmun_start) mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* -- cgit v1.2.2 From 9a5a8f19b43430752067ecaee62fc59e11e88fa6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Nov 2012 14:14:49 -0800 Subject: memcg: oom: fix totalpages calculation for memory.swappiness==0 oom_badness() takes a totalpages argument which says how many pages are available and it uses it as a base for the score calculation. The value is calculated by mem_cgroup_get_limit which considers both limit and total_swap_pages (resp. memsw portion of it). This is usually correct but since fe35004fbf9e ("mm: avoid swapping out with swappiness==0") we do not swap when swappiness is 0 which means that we cannot really use up all the totalpages pages. This in turn confuses oom score calculation if the memcg limit is much smaller than the available swap because the used memory (capped by the limit) is negligible comparing to totalpages so the resulting score is too small if adj!=0 (typically task with CAP_SYS_ADMIN or non zero oom_score_adj). A wrong process might be selected as result. The problem can be worked around by checking mem_cgroup_swappiness==0 and not considering swap at all in such a case. Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Johannes Weiner Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7acf43bf04a2..93a7e36ded89 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1452,17 +1452,26 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { u64 limit; - u64 memsw; limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* - * If memsw is finite and limits the amount of swap space available - * to this memcg, return that limit. + * Do not consider swap space if we cannot swap due to swappiness */ - return min(limit, memsw); + if (mem_cgroup_swappiness(memcg)) { + u64 memsw; + + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + + /* + * If memsw is finite and limits the amount of swap space + * available to this memcg, return that limit. + */ + limit = min(limit, memsw); + } + + return limit; } void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, -- cgit v1.2.2 From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:14:54 -0800 Subject: memcg: fix hotplugged memory zone oops When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exceptionr. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: Wen Congyang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++----------- mm/mmzone.c | 6 +----- mm/page_alloc.c | 2 +- 3 files changed, 37 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93a7e36ded89..dd39ba000b31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, struct mem_cgroup *memcg) { struct mem_cgroup_per_zone *mz; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /* @@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; struct page_cgroup *pc; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; @@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) pc->mem_cgroup = memcg = root_mem_cgroup; mz = page_cgroup_zoneinfo(memcg, page); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /** @@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct mem_cgroup_per_zone *mz; + struct lruvec *lruvec; unsigned long flags, loop; struct list_head *list; struct page *busy; struct zone *zone; zone = &NODE_DATA(node)->node_zones[zid]; - mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lruvec.lists[lru]; + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + list = &lruvec->lists[lru]; - loop = mz->lru_size[lru]; + loop = mem_cgroup_get_lru_size(lruvec, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; @@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); + lruvec_init(&mz->lruvec); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; diff --git a/mm/mmzone.c b/mm/mmzone.c index 3cef80f6ac79..4596d81b89b1 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn, } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ -void lruvec_init(struct lruvec *lruvec, struct zone *zone) +void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); - -#ifdef CONFIG_MEMCG - lruvec->zone = zone; -#endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b74de6702e0..c91598b1b4c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - lruvec_init(&zone->lruvec, zone); + lruvec_init(&zone->lruvec); if (!size) continue; -- cgit v1.2.2 From f58b59c1df3cb990d644018e1461cd4acd3c1700 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Fri, 16 Nov 2012 14:14:55 -0800 Subject: swapfile: fix name leak in swapoff There's a name leak introduced by commit 91a27b2a7567 ("vfs: define struct filename and have getname() return it"). Add the missing putname. [akpm@linux-foundation.org: cleanup] Signed-off-by: Xiaotian Feng Reviewed-by: Jeff Layton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 71cd288b2001..f91a25547ffe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1494,9 +1494,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) BUG_ON(!current->mm); pathname = getname(specialfile); - err = PTR_ERR(pathname); if (IS_ERR(pathname)) - goto out; + return PTR_ERR(pathname); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); @@ -1608,6 +1607,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) out_dput: filp_close(victim, NULL); out: + putname(pathname); return err; } -- cgit v1.2.2 From 96710098ee124951ff2fed7cd8406da92aad011a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Nov 2012 14:14:59 -0800 Subject: mm: revert "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" Jiri Slaby reported the following: (It's an effective revert of "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures".) Given kswapd had hours of runtime in ps/top output yesterday in the morning and after the revert it's now 2 minutes in sum for the last 24h, I would say, it's gone. The intention of the patch in question was to compensate for the loss of lumpy reclaim. Part of the reason lumpy reclaim worked is because it aggressively reclaimed pages and this patch was meant to be a sane compromise. When compaction fails, it gets deferred and both compaction and reclaim/compaction is deferred avoid excessive reclaim. However, since commit c654345924f7 ("mm: remove __GFP_NO_KSWAPD"), kswapd is woken up each time and continues reclaiming which was not taken into account when the patch was developed. Attempts to address the problem ended up just changing the shape of the problem instead of fixing it. The release window gets closer and while a THP allocation failing is not a major problem, kswapd chewing up a lot of CPU is. This patch reverts commit 83fde0f22872 ("mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures") and will be revisited in the future. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Tested-by: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Jiri Slaby Cc: Johannes Hirte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b055e9379bc..48550c66f1f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1760,28 +1760,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } -#ifdef CONFIG_COMPACTION -/* - * If compaction is deferred for sc->order then scale the number of pages - * reclaimed based on the number of consecutive allocation failures - */ -static unsigned long scale_for_compaction(unsigned long pages_for_compaction, - struct lruvec *lruvec, struct scan_control *sc) -{ - struct zone *zone = lruvec_zone(lruvec); - - if (zone->compact_order_failed <= sc->order) - pages_for_compaction <<= zone->compact_defer_shift; - return pages_for_compaction; -} -#else -static unsigned long scale_for_compaction(unsigned long pages_for_compaction, - struct lruvec *lruvec, struct scan_control *sc) -{ - return pages_for_compaction; -} -#endif - /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1829,9 +1807,6 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - - pages_for_compaction = scale_for_compaction(pages_for_compaction, - lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); -- cgit v1.2.2 From 498c2280212327858e521e9d21345d4cc2637f54 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 16 Nov 2012 14:15:00 -0800 Subject: mm: highmem: don't treat PKMAP_ADDR(LAST_PKMAP) as a highmem address kmap_to_page returns the corresponding struct page for a virtual address of an arbitrary mapping. This works by checking whether the address falls in the pkmap region and using the pkmap page tables instead of the linear mapping if appropriate. Unfortunately, the bounds checking means that PKMAP_ADDR(LAST_PKMAP) is incorrectly treated as a highmem address and we can end up walking off the end of pkmap_page_table and subsequently passing junk to pte_page. This patch fixes the bound check to stay within the pkmap tables. Signed-off-by: Will Deacon Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index d517cd16a6eb..2da13a5c50e2 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -98,7 +98,7 @@ struct page *kmap_to_page(void *vaddr) { unsigned long addr = (unsigned long)vaddr; - if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { + if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; return pte_page(pkmap_page_table[i]); } -- cgit v1.2.2 From 215c02bc33bbd5ff4d7379a909462d11f0103218 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:15:03 -0800 Subject: tmpfs: fix shmem_getpage_gfp() VM_BUG_ON Fuzzing with trinity hit the "impossible" VM_BUG_ON(error) (which Fedora has converted to WARNING) in shmem_getpage_gfp(): WARNING: at mm/shmem.c:1151 shmem_getpage_gfp+0xa5c/0xa70() Pid: 29795, comm: trinity-child4 Not tainted 3.7.0-rc2+ #49 Call Trace: warn_slowpath_common+0x7f/0xc0 warn_slowpath_null+0x1a/0x20 shmem_getpage_gfp+0xa5c/0xa70 shmem_fault+0x4f/0xa0 __do_fault+0x71/0x5c0 handle_pte_fault+0x97/0xae0 handle_mm_fault+0x289/0x350 __do_page_fault+0x18e/0x530 do_page_fault+0x2b/0x50 page_fault+0x28/0x30 tracesys+0xe1/0xe6 Thanks to Johannes for pointing to truncation: free_swap_and_cache() only does a trylock on the page, so the page lock we've held since before confirming swap is not enough to protect against truncation. What cleanup is needed in this case? Just delete_from_swap_cache(), which takes care of the memcg uncharge. Signed-off-by: Hugh Dickins Reported-by: Dave Jones Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 67afba5117f2..dc12264f44f5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1145,8 +1145,20 @@ repeat: if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); - /* We already confirmed swap, and make no allocation */ - VM_BUG_ON(error); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest (including mem_cgroup_uncharge_swapcache). + * Reset swap.val? No, leave it so "failed" goes back to + * "repeat": reading a hole and writing should succeed. + */ + if (error) + delete_from_swap_cache(page); } if (error) goto failed; -- cgit v1.2.2 From 0f3c42f522dc1ad7e27affc0a4aa8c790bce0a66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:15:04 -0800 Subject: tmpfs: change final i_blocks BUG to WARNING Under a particular load on one machine, I have hit shmem_evict_inode()'s BUG_ON(inode->i_blocks), enough times to narrow it down to a particular race between swapout and eviction. It comes from the "if (freed > 0)" asymmetry in shmem_recalc_inode(), and the lack of coherent locking between mapping's nrpages and shmem's swapped count. There's a window in shmem_writepage(), between lowering nrpages in shmem_delete_from_page_cache() and then raising swapped count, when the freed count appears to be +1 when it should be 0, and then the asymmetry stops it from being corrected with -1 before hitting the BUG. One answer is coherent locking: using tree_lock throughout, without info->lock; reasonable, but the raw_spin_lock in percpu_counter_add() on used_blocks makes that messier than expected. Another answer may be a further effort to eliminate the weird shmem_recalc_inode() altogether, but previous attempts at that failed. So far undecided, but for now change the BUG_ON to WARN_ON: in usual circumstances it remains a useful consistency check. Signed-off-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index dc12264f44f5..89341b658bd0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -643,7 +643,7 @@ static void shmem_evict_inode(struct inode *inode) kfree(info->symlink); simple_xattrs_free(&info->xattrs); - BUG_ON(inode->i_blocks); + WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); clear_inode(inode); } -- cgit v1.2.2 From 5576646f3c1abd60d72d19829de6f5d8c2ca8ecf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 16 Nov 2012 14:15:06 -0800 Subject: revert "mm: fix-up zone present pages" Revert commit 7f1290f2f2a4 ("mm: fix-up zone present pages") That patch tried to fix a issue when calculating zone->present_pages, but it caused a regression on 32bit systems with HIGHMEM. With that change, reset_zone_present_pages() resets all zone->present_pages to zero, and fixup_zone_present_pages() is called to recalculate zone->present_pages when the boot allocator frees core memory pages into buddy allocator. Because highmem pages are not freed by bootmem allocator, all highmem zones' present_pages becomes zero. Various options for improving the situation are being discussed but for now, let's return to the 3.6 code. Cc: Jianguo Wu Cc: Jiang Liu Cc: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Acked-by: David Rientjes Tested-by: Chris Clayton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 10 +--------- mm/memory_hotplug.c | 7 ------- mm/nobootmem.c | 3 --- mm/page_alloc.c | 34 ---------------------------------- 4 files changed, 1 insertion(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 434be4ae7a04..f468185b3b28 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); - fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), - start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); - fixup_zone_present_pages( - page_to_nid(page), - start + off, start + off + 1); count++; } vec >>= 1; @@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) { - fixup_zone_present_pages(page_to_nid(page), - page_to_pfn(page), page_to_pfn(page) + 1); + while (pages--) __free_pages_bootmem(page++, 0); - } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 56b758ae57d2..e4eeacae2b91 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; - struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); - - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages++; - zone_span_writeunlock(zone); - totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 714d5d650470..bd82f6b31411 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); - fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), - start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; - reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c91598b1b4c0..7bb35ac0964a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,37 +6098,3 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } - -/* reset zone->present_pages */ -void reset_zone_present_pages(void) -{ - struct zone *z; - int i, nid; - - for_each_node_state(nid, N_HIGH_MEMORY) { - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - z->present_pages = 0; - } - } -} - -/* calculate zone's present pages in buddy system */ -void fixup_zone_present_pages(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - struct zone *z; - unsigned long zone_start_pfn, zone_end_pfn; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - zone_start_pfn = z->zone_start_pfn; - zone_end_pfn = zone_start_pfn + z->spanned_pages; - - /* if the two regions intersect */ - if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) - z->present_pages += min(end_pfn, zone_end_pfn) - - max(start_pfn, zone_start_pfn); - } -} -- cgit v1.2.2 From ef6c5be658f6a70c1256fbd18e18ee0dc24c3386 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 21 Nov 2012 14:21:51 -0500 Subject: fix incorrect NR_FREE_PAGES accounting (appears like memory leak) There have been some 3.7-rc reports of vm issues, including some kswapd bugs and, more importantly, some memory "leaks": http://www.spinics.net/lists/linux-mm/msg46187.html https://bugzilla.kernel.org/show_bug.cgi?id=50181 Commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available") took split_free_page() and reused it for the compaction code. It does something curious with capture_free_page() (previously known as split_free_page()): int capture_free_page(struct page *page, int alloc_order, ... __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); Note that expand() puts the pages _back_ in the allocator, but it does not bump NR_FREE_PAGES. We "return" 'alloc_order' worth of pages, but we accounted for removing 'order' in the __mod_zone_page_state() call. For the old split_page()-style use (order==alloc_order) the bug will not trigger. But, when called from the compaction code where we occasionally get a larger page out of the buddy allocator than we need, we will run in to this. This patch simply changes the NR_FREE_PAGES manipulation to the correct 'alloc_order' instead of 'order'. I've been able to repeatedly trigger this in my testing environment. The amount "leaked" very closely tracks the imbalance I see in buddy pages vs. NR_FREE_PAGES. I have confirmed that this patch fixes the imbalance Signed-off-by: Dave Hansen Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7bb35ac0964a..bcb72c6e2b2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1405,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) mt = get_pageblock_migratetype(page); if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); + __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); if (alloc_order != order) expand(zone, page, alloc_order, order, -- cgit v1.2.2 From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:45 -0800 Subject: Revert "mm: remove __GFP_NO_KSWAPD" With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there are a storm of THP requests that are simply rejected, it will still be the the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loopp, shrinking a small number of pages and calling shrink_slab() on each iteration. The temptation is to supply a patch that checks if kswapd was woken for THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not backed up by proper testing. As 3.7 is very close to release and this is not a bug we should release with, a safer path is to revert "mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing out the balance_pgdat() logic in general. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcb72c6e2b2d..92871579cbee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2494,7 +2495,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.2.2 From 50694c28f1e1dbea18272980d265742a5027fb63 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:48 -0800 Subject: mm: vmscan: check for fatal signals iff the process was throttled Commit 5515061d22f0 ("mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage") introduced a check for fatal signals after a process gets throttled for network storage. The intention was that if a process was throttled and got killed that it should not trigger the OOM killer. As pointed out by Minchan Kim and David Rientjes, this check is in the wrong place and too broad. If a system is in am OOM situation and a process is exiting, it can loop in __alloc_pages_slowpath() and calling direct reclaim in a loop. As the fatal signal is pending it returns 1 as if it is making forward progress and can effectively deadlock. This patch moves the fatal_signal_pending() check after throttling to throttle_direct_reclaim() where it belongs. If the process is killed while throttled, it will return immediately without direct reclaim except now it will have TIF_MEMDIE set and will use the PFMEMALLOC reserves. Minchan pointed out that it may be better to direct reclaim before returning to avoid using the reserves because there may be pages that can easily reclaim that would avoid using the reserves. However, we do no such targetted reclaim and there is no guarantee that suitable pages are available. As it is expected that this throttling happens when swap-over-NFS is used there is a possibility that the process will instead swap which may allocate network buffers from the PFMEMALLOC reserves. Hence, in the swap-over-nfs case where a process can be throtted and be killed it can use the reserves to exit or it can potentially use reserves to swap a few pages and then exit. This patch takes the option of using the reserves if necessary to allow the process exit quickly. If this patch passes review it should be considered a -stable candidate for 3.6. Signed-off-by: Mel Gorman Cc: David Rientjes Cc: Luigi Semenzato Cc: Dan Magenheimer Cc: KOSAKI Motohiro Cc: Sonny Rao Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 48550c66f1f2..cbf84e152f04 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2207,9 +2207,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes - * when the low watermark is reached + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. */ -static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zone *zone; @@ -2224,13 +2227,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) - return; + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; /* Check if the pfmemalloc reserves are ok */ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); pgdat = zone->zone_pgdat; if (pfmemalloc_watermark_ok(pgdat)) - return; + goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); @@ -2246,12 +2256,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat), HZ); - return; + + goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, @@ -2273,13 +2291,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = sc.gfp_mask, }; - throttle_direct_reclaim(gfp_mask, zonelist, nodemask); - /* - * Do not enter reclaim if fatal signal is pending. 1 is returned so - * that the page allocator does not consider triggering OOM + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. */ - if (fatal_signal_pending(current)) + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, -- cgit v1.2.2