From eab309494ae2b9e15f85520f00de3893162c2e43 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 24 May 2012 00:45:21 -0700 Subject: memblock: Document memblock_is_region_{memory,reserved}() At first glance one would think that memblock_is_region_memory() and memblock_is_region_reserved() would be implemented in the same way. Unfortunately they aren't and the former returns whether the region specified is a subset of a memory bank while the latter returns whether the region specified intersects with reserved memory. Document the two functions so that users aren't tempted to make the implementation the same between them and to clarify the purpose of the functions. Signed-off-by: Stephen Boyd Cc: Tejun Heo Link: http://lkml.kernel.org/r/1337845521-32755-1-git-send-email-sboyd@codeaurora.org Signed-off-by: Ingo Molnar --- mm/memblock.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 952123eba43..32a0a5e4d79 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -867,6 +867,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); @@ -879,6 +889,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); -- cgit v1.2.2 From 1e11ad8dc42975d5c2bab7d478f6cd875602eda4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 8 Jun 2012 13:21:26 -0700 Subject: mm, oom: fix badness score underflow If the privileges given to root threads (3% of allowable memory) or a negative value of /proc/pid/oom_score_adj happen to exceed the amount of rss of a thread, its badness score overflows as a result of commit a7f638f999ff ("mm, oom: normalize oom scores to oom_score_adj scale only for userspace"). Fix this by making the type signed and return 1, meaning the thread is still eligible for kill, if the value is negative. Reported-by: Dave Jones Acked-by: Oleg Nesterov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e1967736..416637f0e92 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -183,7 +183,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points; + long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -223,7 +223,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - return points ? points : 1; + return points > 0 ? points : 1; } /* -- cgit v1.2.2 From 047fe3605235888f3ebcda0c728cb31937eadfe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Cc: stable # 2.6.35 Tested-by: Dave Jones Signed-off-by: Jens Axboe --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220a21..c244e93a70f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1577,6 +1577,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1665,7 +1666,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; -- cgit v1.2.2 From 9b15b817f3d62409290fd56fe3cbb076a931bb0a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 15 Jun 2012 17:55:50 -0700 Subject: swap: fix shmem swapping when more than 8 areas Minchan Kim reports that when a system has many swap areas, and tmpfs swaps out to the ninth or more, shmem_getpage_gfp()'s attempts to read back the page cannot locate it, and the read fails with -ENOMEM. Whoops. Yes, I blindly followed read_swap_header()'s pte_to_swp_entry( swp_entry_to_pte()) technique for determining maximum usable swap offset, without stopping to realize that that actually depends upon the pte swap encoding shifting swap offset to the higher bits and truncating it there. Whereas our radix_tree swap encoding leaves offset in the lower bits: it's swap "type" (that is, index of swap area) that was truncated. Fix it by reducing the SWP_TYPE_SHIFT() in swapops.h, and removing the broken radix_to_swp_entry(swp_to_radix_entry()) from read_swap_header(). This does not reduce the usable size of a swap area any further, it leaves it as claimed when making the original commit: no change from 3.0 on x86_64, nor on i386 without PAE; but 3.0's 512GB is reduced to 128GB per swapfile on i386 with PAE. It's not a change I would have risked five years ago, but with x86_64 supported for ten years, I believe it's appropriate now. Hmm, and what if some architecture implements its swap pte with offset encoded below type? That would equally break the maximum usable swap offset check. Happily, they all follow the same tradition of encoding offset above type, but I'll prepare a check on that for next. Reported-and-Reviewed-and-Tested-by: Minchan Kim Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org [3.1, 3.2, 3.3, 3.4] Signed-off-by: Linus Torvalds --- mm/swapfile.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index de5bc51c4a6..71373d03fce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1916,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))); - maxpages = swp_offset(radix_to_swp_entry( - swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; - + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ -- cgit v1.2.2 From 61eafb00d55dfbccdfce543c6b60e369ff4f8f18 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Jun 2012 12:52:58 -0700 Subject: mm, oom: fix and cleanup oom score calculations The divide in p->signal->oom_score_adj * totalpages / 1000 within oom_badness() was causing an overflow of the signed long data type. This adds both the root bias and p->signal->oom_score_adj before doing the normalization which fixes the issue and also cleans up the calculation. Tested-by: Dave Jones Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 416637f0e92..77775138e93 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -184,6 +184,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; + long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + adj = p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } @@ -210,14 +212,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= 30 * totalpages / 1000; + adj -= 30; - /* - * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may - * either completely disable oom killing or always prefer a certain - * task. - */ - points += p->signal->oom_score_adj * totalpages / 1000; + /* Normalize to oom_score_adj units */ + adj *= totalpages / 1000; + points += adj; /* * Never return 0 for an eligible task regardless of the root bonus and -- cgit v1.2.2 From 3a981f482cc29f7d0aeab509e51ea15519a6e961 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 20 Jun 2012 12:52:58 -0700 Subject: memcg: fix use_hierarchy css_is_ancestor oops regression If use_hierarchy is set, reclaim testing soon oopses in css_is_ancestor() called from __mem_cgroup_same_or_subtree() called from page_referenced(): when processes are exiting, it's easy for mm_match_cgroup() to pass along a NULL memcg coming from a NULL mm->owner. Check for that in __mem_cgroup_same_or_subtree(). Return true or false? False because we cannot know if it was in the hierarchy, but also false because it's better not to count a reference from an exiting process. Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac35bccadb7..f8517cd28fe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, { if (root_memcg == memcg) return true; - if (!root_memcg->use_hierarchy) + if (!root_memcg->use_hierarchy || !memcg) return false; return css_is_ancestor(&memcg->css, &root_memcg->css); } -- cgit v1.2.2 From e0897d75f0b22e8c3a7287a48548c5686ef73447 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Jun 2012 12:53:00 -0700 Subject: mm, thp: print useful information when mmap_sem is unlocked in zap_pmd_range Andrea asked for addr, end, vma->vm_start, and vma->vm_end to be emitted when !rwsem_is_locked(&tlb->mm->mmap_sem). Otherwise, debugging the underlying issue is more difficult. Suggested-by: Andrea Arcangeli Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 1b7dc662bf9..8762c4f915f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; -- cgit v1.2.2 From dad7557eb705688040aac134efa5418b66d5ed92 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 20 Jun 2012 12:53:01 -0700 Subject: mm: fix kernel-doc warnings Fix kernel-doc warnings such as Warning(../mm/page_cgroup.c:432): No description found for parameter 'id' Warning(../mm/page_cgroup.c:432): Excess function parameter 'mem' description in 'swap_cgroup_record' Signed-off-by: Wanpeng Li Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 12 ++++++------ mm/memcontrol.c | 4 ++-- mm/oom_kill.c | 2 +- mm/page_cgroup.c | 4 ++-- mm/pagewalk.c | 1 - mm/percpu-vm.c | 1 - 6 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 32a0a5e4d79..b65b687e236 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -540,9 +540,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first free area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of @@ -616,9 +616,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f8517cd28fe..f72b5e52451 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup - * @mem: the memory cgroup + * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. @@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, /** * test_mem_cgroup_node_reclaimable - * @mem: the target memcg + * @memcg: the target memcg * @nid: the node ID to be checked. * @noswap : specify true here if the user wants flle only information. * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 77775138e93..ac300c99baf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -365,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, /** * dump_tasks - dump current memory state of all system tasks - * @mem: current's memory controller, if constrained + * @memcg: current's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * Dumps the current memory state of all eligible tasks. Tasks not in the same diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059..eb750f85139 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @end: swap entry to be cmpxchged + * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * @@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into - * @mem: mem_cgroup to be recorded + * @id: mem_cgroup to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e1271..6c118d012bb 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c..3707c71ae4c 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -360,7 +360,6 @@ err_free: * @chunk: chunk to depopulate * @off: offset to the area to depopulate * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping -- cgit v1.2.2 From eb4546bbbdb160aff084d50511165f385756af18 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Jun 2012 12:53:02 -0700 Subject: mm/memory.c: fix kernel-doc warnings Fix kernel-doc warnings in mm/memory.c: Warning(mm/memory.c:1377): No description found for parameter 'start' Warning(mm/memory.c:1377): Excess function parameter 'address' description in 'zap_page_range' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8762c4f915f..2466d125023 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1374,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb, /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * @start: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation * -- cgit v1.2.2 From 48c3b583bbddad2220ca4c22319ca5d1f78b2090 Mon Sep 17 00:00:00 2001 From: Greg Pearson Date: Wed, 20 Jun 2012 12:53:05 -0700 Subject: mm/memblock: fix overlapping allocation when doubling reserved array __alloc_memory_core_early() asks memblock for a range of memory then try to reserve it. If the reserved region array lacks space for the new range, memblock_double_array() is called to allocate more space for the array. If memblock is used to allocate memory for the new array it can end up using a range that overlaps with the range originally allocated in __alloc_memory_core_early(), leading to possible data corruption. With this patch memblock_double_array() now calls memblock_find_in_range() with a narrowed candidate range (in cases where the reserved.regions array is being doubled) so any memory allocated will not overlap with the original range that was being reserved. The range is narrowed by passing in the starting address and size of the previously allocated range. Then the range above the ending address is searched and if a candidate is not found, the range below the starting address is searched. Signed-off-by: Greg Pearson Signed-off-by: Yinghai Lu Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index b65b687e236..d4382095f8b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -184,7 +184,24 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; phys_addr_t old_size, new_size, addr; @@ -222,7 +239,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; } else { - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_size, sizeof(phys_addr_t)); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_size, sizeof(phys_addr_t)); + new_array = addr ? __va(addr) : 0; } if (!addr) { @@ -399,7 +427,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -450,7 +478,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { -- cgit v1.2.2 From c4c0e9e544a0eb640798cc66e68f394fa4a561bf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Jun 2012 18:00:12 -0700 Subject: mm, mempolicy: fix mbind() to do synchronous migration If the range passed to mbind() is not allocated on nodes set in the nodemask, it migrates the pages to respect the constraint. The final formal of migrate_pages() is a mode of type enum migrate_mode, not a boolean. do_mbind() is currently passing "true" which is the equivalent of MIGRATE_SYNC_LIGHT. This should instead be MIGRATE_SYNC for synchronous page migration. Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f15c1b24ca1..1d771e4200d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, true); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } -- cgit v1.2.2 From 9ab4233dd08036fe34a89c7dc6f47a8bf2eb29eb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Jul 2012 16:00:11 -0700 Subject: mm: Hold a file reference in madvise_remove Otherwise the code races with munmap (causing a use-after-free of the vma) or with close (causing a use-after-free of the struct file). The bug was introduced by commit 90ed52ebe481 ("[PATCH] holepunch: fix mmap_sem i_mutex deadlock") Cc: Hugh Dickins Cc: Miklos Szeredi Cc: Badari Pulavarty Cc: Nick Piggin Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- mm/madvise.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index deff1b64a08..14d260fa0d1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma, { loff_t offset; int error; + struct file *f; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; - if (!vma->vm_file || !vma->vm_file->f_mapping - || !vma->vm_file->f_mapping->host) { + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } @@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma, offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* filesystem's fallocate may need to take i_mutex */ + /* + * Filesystem's fallocate may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); up_read(¤t->mm->mmap_sem); - error = do_fallocate(vma->vm_file, + error = do_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); + fput(f); down_read(¤t->mm->mmap_sem); return error; } -- cgit v1.2.2