From 5d53cb27d849c899136c048ec84c940ac449494b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jan 2012 10:14:12 -0800 Subject: memblock: Fix alloc failure due to dumb underflow protection in memblock_find_in_range_node() 7bd0b0f0da ("memblock: Reimplement memblock allocation using reverse free area iterator") implemented a simple top-down allocator using a reverse memblock iterator. To avoid underflow in the allocator loop, it simply raised the lower boundary to the requested size under the assumption that requested size would be far smaller than available memblocks. This causes early page table allocation failure under certain configurations in Xen. Fix it by checking for underflow directly instead of bumping up lower bound. Signed-off-by: Tejun Heo Reported-by: Konrad Rzeszutek Wilk Cc: rjw@sisk.pl Cc: xen-devel@lists.xensource.com Cc: Benjamin Herrenschmidt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120113181412.GA11112@google.com Signed-off-by: Ingo Molnar --- mm/memblock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 2f55f19b7c86..77b5f227e1d8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -106,14 +106,17 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; - /* adjust @start to avoid underflow and allocating the first page */ - start = max3(start, size, (phys_addr_t)PAGE_SIZE); + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); + if (this_end < size) + continue; + cand = round_down(this_end - size, align); if (cand >= this_start) return cand; -- cgit v1.2.2 From b469d4329cf949043f9b93a6644f2c64015ef8cd Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 11 Jan 2012 05:51:10 +0000 Subject: kmemleak: Only scan non-zero-size areas Kmemleak should only track valid scan areas with a non-zero size. Otherwise, such area may reside just at the end of an object and kmemleak would report "Adding scan area to unknown object". Signed-off-by: Tiejun Chen Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c833addd94d7..f9f7310f0fdb 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1036,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); -- cgit v1.2.2 From b370d29ea7565a638ccf85389488364b5abb39fa Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 20 Jan 2012 10:42:40 +0000 Subject: kmemleak: Disable early logging when kmemleak is off by default Commit b6693005 (kmemleak: When the early log buffer is exceeded, report the actual number) deferred the disabling of the early logging to kmemleak_init(). However, when CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the early logging was no longer disabled causing __init kmemleak functions to be called even after the kernel freed the init memory. This patch disables the early logging during kmemleak_init() if kmemleak is left disabled. Reported-by: Dirk Gouders Tested-by: Dirk Gouders Tested-by: Josh Boyer Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f9f7310f0fdb..45eb6217bf38 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1757,6 +1757,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { + atomic_set(&kmemleak_early_log, 0); kmemleak_disable(); return; } -- cgit v1.2.2 From 26dd8e0291fd699142722632c6588a438d6ef0e4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 21 Jan 2012 00:15:23 +0900 Subject: percpu: use bitmap_clear Use bitmap_clear rather than clearing individual bits in a memory region. Signed-off-by: Akinobu Mita Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-vm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 12a48a88c0d8..405d331804c3 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, page_end - page_start); } - for (i = page_start; i < page_end; i++) - __clear_bit(i, populated); + bitmap_clear(populated, page_start, page_end - page_start); } /** -- cgit v1.2.2 From 2673b4cf5d59c3ee5e0c12f6d734d38770324dc4 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 29 Jan 2012 12:17:33 -0600 Subject: backing-dev: fix wakeup timer races with bdi_unregister() While 7a401a972df8e18 ("backing-dev: ensure wakeup_timer is deleted") addressed the problem of the bdi being freed with a queued wakeup timer, there are other races that could happen if the wakeup timer expires after/during bdi_unregister(), before bdi_destroy() is called. wakeup_timer_fn() could attempt to wakeup a task which has already has been freed, or could access a NULL bdi->dev via the wake_forker_thread tracepoint. Cc: Cc: Jens Axboe Reported-by: Chanho Min Reviewed-by: Namjae Jeon Signed-off-by: Rabin Vincent Signed-off-by: Wu Fengguang --- mm/backing-dev.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7ba8feae11b8..dd8e2aafb07e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data) if (bdi->wb.task) { trace_writeback_wake_thread(bdi); wake_up_process(bdi->wb.task); - } else { + } else if (bdi->dev) { /* * When bdi tasks are inactive for long time, they are killed. * In this case we have to wake-up the forker thread which @@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { + struct task_struct *task; + if (!bdi_cap_writeback_dirty(bdi)) return; @@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. */ - if (bdi->wb.task) - kthread_stop(bdi->wb.task); + spin_lock_bh(&bdi->wb_lock); + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock_bh(&bdi->wb_lock); + + if (task) + kthread_stop(task); } /* @@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { + struct device *dev = bdi->dev; + + if (dev) { bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); @@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi) if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); - device_unregister(bdi->dev); + + spin_lock_bh(&bdi->wb_lock); bdi->dev = NULL; + spin_unlock_bh(&bdi->wb_lock); + + device_unregister(dev); } } EXPORT_SYMBOL(bdi_unregister); -- cgit v1.2.2 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce104..c20ff48994c2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, -- cgit v1.2.2 From 35512ecaef03250fe50ad81430dd467f01d9a96b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 3 Feb 2012 15:37:13 -0800 Subject: mm: postpone migrated page mapping reset Postpone resetting page->mapping until the final remove_migration_ptes(). Otherwise the expression PageAnon(migration_entry_to_page(entry)) does not work. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 9871a56d82c3..df141f60289e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -445,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - page->mapping = NULL; /* * If any waiters have accumulated on the new page then @@ -667,6 +666,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } else { if (remap_swapcache) remove_migration_ptes(page, newpage); + page->mapping = NULL; } unlock_page(newpage); -- cgit v1.2.2 From 82b3f2a7171731cce62f25058d25afb91a14710c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Feb 2012 15:37:14 -0800 Subject: mm/memcontrol.c: fix warning with CONFIG_NUMA=n mm/memcontrol.c: In function 'memcg_check_events': mm/memcontrol.c:779: warning: unused variable 'do_numainfo' Acked-by: Michal Hocko Cc: Li Zefan Cc: Hiroyuki KAMEZAWA Cc: Johannes Weiner Acked-by: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 556859fec4ef..6728a7ae6f2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -776,7 +776,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit, do_numainfo; + bool do_softlimit; + bool do_numainfo __maybe_unused; do_softlimit = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); -- cgit v1.2.2 From 99f02ef1f18631eb0a4e0ea0a3d56878dbcb4b90 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 3 Feb 2012 15:37:14 -0800 Subject: mm/filemap_xip.c: fix race condition in xip_file_fault() Fix a race condition that shows in conjunction with xip_file_fault() when two threads of the same user process fault on the same memory page. In this case, the race winner will install the page table entry and the unlucky loser will cause an oops: xip_file_fault calls vm_insert_pfn (via vm_insert_mixed) which drops out at this check: retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; The resulting -EBUSY return value will trigger a BUG_ON() in xip_file_fault. This fix simply considers the fault as fixed in this case, because the race winner has successfully installed the pte. [akpm@linux-foundation.org: use conventional (and consistent) comment layout] Reported-by: David Sadler Signed-off-by: Carsten Otte Reported-by: Louis Alex Eisner Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index f91b2f687343..a4eb31132229 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -263,7 +263,12 @@ found: xip_pfn); if (err == -ENOMEM) return VM_FAULT_OOM; - BUG_ON(err); + /* + * err == -EBUSY is fine, we've raced against another thread + * that faulted-in the same page + */ + if (err != -EBUSY) + BUG_ON(err); return VM_FAULT_NOPAGE; } else { int err, ret = VM_FAULT_OOM; -- cgit v1.2.2 From 3deaa7190a8da38453c4fabd9dec7f66d17fff67 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Feb 2012 15:37:17 -0800 Subject: readahead: fix pipeline break caused by block plug Herbert Poetzl reported a performance regression since 2.6.39. The test is a simple dd read, but with big block size. The reason is: T1: ra (A, A+128k), (A+128k, A+256k) T2: lock_page for page A, submit the 256k T3: hit page A+128K, ra (A+256k, A+384). the range isn't submitted because of plug and there isn't any lock_page till we hit page A+256k because all pages from A to A+256k is in memory T4: hit page A+256k, ra (A+384, A+ 512). Because of plug, the range isn't submitted again. T5: lock_page A+256k, so (A+256k, A+512k) will be submitted. The task is waitting for (A+256k, A+512k) finish. There is no request to disk in T3 and T4, so readahead pipeline breaks. We really don't need block plug for generic_file_aio_read() for buffered I/O. The readahead already has plug and has fine grained control when I/O should be submitted. Deleting plug for buffered I/O fixes the regression. One side effect is plug makes the request size 256k, the size is 128k without it. This is because default ra size is 128k and not a reason we need plug here. Vivek said: : We submit some readahead IO to device request queue but because of nested : plug, queue never gets unplugged. When read logic reaches a page which is : not in page cache, it waits for page to be read from the disk : (lock_page_killable()) and that time we flush the plug list. : : So effectively read ahead logic is kind of broken in parts because of : nested plugging. Removing top level plug (generic_file_aio_read()) for : buffered reads, will allow unplugging queue earlier for readahead. Signed-off-by: Shaohua Li Signed-off-by: Wu Fengguang Reported-by: Herbert Poetzl Tested-by: Eric Dumazet Cc: Christoph Hellwig Cc: Jens Axboe Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 97f49ed35bd2..b66275757c28 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1400,15 +1400,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; - struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; - blk_start_plug(&plug); - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1424,8 +1421,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { + struct blk_plug plug; + + blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); + blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -1481,7 +1482,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: - blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); -- cgit v1.2.2 From 0bf380bc70ecba68cb4d74dc656cc2fa8c4d801a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 3 Feb 2012 15:37:18 -0800 Subject: mm: compaction: check pfn_valid when entering a new MAX_ORDER_NR_PAGES block during isolation for migration When isolating for migration, migration starts at the start of a zone which is not necessarily pageblock aligned. Further, it stops isolating when COMPACT_CLUSTER_MAX pages are isolated so migrate_pfn is generally not aligned. This allows isolate_migratepages() to call pfn_to_page() on an invalid PFN which can result in a crash. This was originally reported against a 3.0-based kernel with the following trace in a crash dump. PID: 9902 TASK: d47aecd0 CPU: 0 COMMAND: "memcg_process_s" #0 [d72d3ad0] crash_kexec at c028cfdb #1 [d72d3b24] oops_end at c05c5322 #2 [d72d3b38] __bad_area_nosemaphore at c0227e60 #3 [d72d3bec] bad_area at c0227fb6 #4 [d72d3c00] do_page_fault at c05c72ec #5 [d72d3c80] error_code (via page_fault) at c05c47a4 EAX: 00000000 EBX: 000c0000 ECX: 00000001 EDX: 00000807 EBP: 000c0000 DS: 007b ESI: 00000001 ES: 007b EDI: f3000a80 GS: 6f50 CS: 0060 EIP: c030b15a ERR: ffffffff EFLAGS: 00010002 #6 [d72d3cb4] isolate_migratepages at c030b15a #7 [d72d3d14] zone_watermark_ok at c02d26cb #8 [d72d3d2c] compact_zone at c030b8de #9 [d72d3d68] compact_zone_order at c030bba1 #10 [d72d3db4] try_to_compact_pages at c030bc84 #11 [d72d3ddc] __alloc_pages_direct_compact at c02d61e7 #12 [d72d3e08] __alloc_pages_slowpath at c02d66c7 #13 [d72d3e78] __alloc_pages_nodemask at c02d6a97 #14 [d72d3eb8] alloc_pages_vma at c030a845 #15 [d72d3ed4] do_huge_pmd_anonymous_page at c03178eb #16 [d72d3f00] handle_mm_fault at c02f36c6 #17 [d72d3f30] do_page_fault at c05c70ed #18 [d72d3fb0] error_code (via page_fault) at c05c47a4 EAX: b71ff000 EBX: 00000001 ECX: 00001600 EDX: 00000431 DS: 007b ESI: 08048950 ES: 007b EDI: bfaa3788 SS: 007b ESP: bfaa36e0 EBP: bfaa3828 GS: 6f50 CS: 0073 EIP: 080487c8 ERR: ffffffff EFLAGS: 00010202 It was also reported by Herbert van den Bergh against 3.1-based kernel with the following snippet from the console log. BUG: unable to handle kernel paging request at 01c00008 IP: [] isolate_migratepages+0x119/0x390 *pdpt = 000000002f7ce001 *pde = 0000000000000000 It is expected that it also affects 3.2.x and current mainline. The problem is that pfn_valid is only called on the first PFN being checked and that PFN is not necessarily aligned. Lets say we have a case like this H = MAX_ORDER_NR_PAGES boundary | = pageblock boundary m = cc->migrate_pfn f = cc->free_pfn o = memory hole H------|------H------|----m-Hoooooo|ooooooH-f----|------H The migrate_pfn is just below a memory hole and the free scanner is beyond the hole. When isolate_migratepages started, it scans from migrate_pfn to migrate_pfn+pageblock_nr_pages which is now in a memory hole. It checks pfn_valid() on the first PFN but then scans into the hole where there are not necessarily valid struct pages. This patch ensures that isolate_migratepages calls pfn_valid when necessary. Reported-by: Herbert van den Bergh Tested-by: Herbert van den Bergh Signed-off-by: Mel Gorman Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 71a58f67f481..bd939a574b84 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -313,6 +313,19 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } else if (!locked) spin_lock_irq(&zone->lru_lock); + /* + * migrate_pfn does not necessarily start aligned to a + * pageblock. Ensure that pfn_valid is called when moving + * into a new MAX_ORDER_NR_PAGES range in case of large + * memory holes within the zone + */ + if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(low_pfn)) { + low_pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + } + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; -- cgit v1.2.2 From dc9086004b3d5db75997a645b3fe08d9138b7ad0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 8 Feb 2012 17:13:38 -0800 Subject: mm: compaction: check for overlapping nodes during isolation for migration When isolating pages for migration, migration starts at the start of a zone while the free scanner starts at the end of the zone. Migration avoids entering a new zone by never going beyond the free scanned. Unfortunately, in very rare cases nodes can overlap. When this happens, migration isolates pages without the LRU lock held, corrupting lists which will trigger errors in reclaim or during page free such as in the following oops BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] free_pcppages_bulk+0xcc/0x450 PGD 1dda554067 PUD 1e1cb58067 PMD 0 Oops: 0000 [#1] SMP CPU 37 Pid: 17088, comm: memcg_process_s Tainted: G X RIP: free_pcppages_bulk+0xcc/0x450 Process memcg_process_s (pid: 17088, threadinfo ffff881c2926e000, task ffff881c2926c0c0) Call Trace: free_hot_cold_page+0x17e/0x1f0 __pagevec_free+0x90/0xb0 release_pages+0x22a/0x260 pagevec_lru_move_fn+0xf3/0x110 putback_lru_page+0x66/0xe0 unmap_and_move+0x156/0x180 migrate_pages+0x9e/0x1b0 compact_zone+0x1f3/0x2f0 compact_zone_order+0xa2/0xe0 try_to_compact_pages+0xdf/0x110 __alloc_pages_direct_compact+0xee/0x1c0 __alloc_pages_slowpath+0x370/0x830 __alloc_pages_nodemask+0x1b1/0x1c0 alloc_pages_vma+0x9b/0x160 do_huge_pmd_anonymous_page+0x160/0x270 do_page_fault+0x207/0x4c0 page_fault+0x25/0x30 The "X" in the taint flag means that external modules were loaded but but is unrelated to the bug triggering. The real problem was because the PFN layout looks like this Zone PFN ranges: DMA 0x00000010 -> 0x00001000 DMA32 0x00001000 -> 0x00100000 Normal 0x00100000 -> 0x01e80000 Movable zone start PFN for each node early_node_map[14] active PFN ranges 0: 0x00000010 -> 0x0000009b 0: 0x00000100 -> 0x0007a1ec 0: 0x0007a354 -> 0x0007a379 0: 0x0007f7ff -> 0x0007f800 0: 0x00100000 -> 0x00680000 1: 0x00680000 -> 0x00e80000 0: 0x00e80000 -> 0x01080000 1: 0x01080000 -> 0x01280000 0: 0x01280000 -> 0x01480000 1: 0x01480000 -> 0x01680000 0: 0x01680000 -> 0x01880000 1: 0x01880000 -> 0x01a80000 0: 0x01a80000 -> 0x01c80000 1: 0x01c80000 -> 0x01e80000 The fix is straight-forward. isolate_migratepages() has to make a similar check to isolate_freepage to ensure that it never isolates pages from a zone it does not hold the LRU lock for. This was discovered in a 3.0-based kernel but it affects 3.1.x, 3.2.x and current mainline. Signed-off-by: Mel Gorman Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bd939a574b84..d9ebebe1a2aa 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -330,8 +330,17 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; nr_scanned++; - /* Get the page and skip if free */ + /* + * Get the page and ensure the page is within the same zone. + * See the comment in isolate_freepages about overlapping + * nodes. It is deliberate that the new zone lock is not taken + * as memory compaction should not move pages between nodes. + */ page = pfn_to_page(low_pfn); + if (page_zone(page) != zone) + continue; + + /* Skip if free */ if (PageBuddy(page)) continue; -- cgit v1.2.2 From b9980cdcf2524c5fe15d8cbae9c97b3ed6385563 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 8 Feb 2012 17:13:40 -0800 Subject: mm: fix UP THP spin_is_locked BUGs Fix CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_SMP=n CONFIG_DEBUG_VM=y CONFIG_DEBUG_SPINLOCK=n kernel: spin_is_locked() is then always false, and so triggers some BUGs in Transparent HugePage codepaths. asm-generic/bug.h mentions this problem, and provides a WARN_ON_SMP(x); but being too lazy to add VM_BUG_ON_SMP, BUG_ON_SMP, WARN_ON_SMP_ONCE, VM_WARN_ON_SMP_ONCE, just test NR_CPUS != 1 in the existing VM_BUG_ONs. Signed-off-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- mm/swap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b3ffc21ce801..91d3efb25d15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2083,7 +2083,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -2113,7 +2113,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/swap.c b/mm/swap.c index b0f529b38979..fff1ff7fb9ad 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -659,7 +659,7 @@ void lru_add_page_tail(struct zone* zone, VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); SetPageLRU(page_tail); -- cgit v1.2.2 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; -- cgit v1.2.2 From 371528caec553785c37f73fa3926ea0de84f986f Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 24 Feb 2012 05:14:46 +0400 Subject: mm: memcg: Correct unregistring of events attached to the same eventfd There is an issue when memcg unregisters events that were attached to the same eventfd: - On the first call mem_cgroup_usage_unregister_event() removes all events attached to a given eventfd, and if there were no events left, thresholds->primary would become NULL; - Since there were several events registered, cgroups core will call mem_cgroup_usage_unregister_event() again, but now kernel will oops, as the function doesn't expect that threshold->primary may be NULL. That's a good question whether mem_cgroup_usage_unregister_event() should actually remove all events in one go, but nowadays it can't do any better as cftype->unregister_event callback doesn't pass any private event-associated cookie. So, let's fix the issue by simply checking for threshold->primary. FWIW, w/o the patch the following oops may be observed: BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 IP: [] mem_cgroup_usage_unregister_event+0x9c/0x1f0 Pid: 574, comm: kworker/0:2 Not tainted 3.3.0-rc4+ #9 Bochs Bochs RIP: 0010:[] [] mem_cgroup_usage_unregister_event+0x9c/0x1f0 RSP: 0018:ffff88001d0b9d60 EFLAGS: 00010246 Process kworker/0:2 (pid: 574, threadinfo ffff88001d0b8000, task ffff88001de91cc0) Call Trace: [] cgroup_event_remove+0x2b/0x60 [] process_one_work+0x174/0x450 [] worker_thread+0x123/0x2d0 Cc: stable Signed-off-by: Anton Vorontsov Acked-by: KAMEZAWA Hiroyuki Cc: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6728a7ae6f2d..228d6461c12a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4414,6 +4414,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, */ BUG_ON(!thresholds); + if (!thresholds->primary) + goto unlock; + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); /* Check if a threshold crossed before removing */ @@ -4462,7 +4465,7 @@ swap_buffers: /* To be sure that nobody uses thresholds */ synchronize_rcu(); - +unlock: mutex_unlock(&memcg->thresholds_lock); } -- cgit v1.2.2 From 918e556ec214ed2f584e4cac56d7b29e4bb6bf27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Feb 2012 13:50:35 +0000 Subject: NOMMU: Lock i_mmap_mutex for access to the VMA prio list Lock i_mmap_mutex for access to the VMA prio list to prevent concurrent access. Currently, certain parts of the mmap handling are protected by the region mutex, but not all. Reported-by: Al Viro Signed-off-by: David Howells Acked-by: Al Viro cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/nommu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index b982290fd962..ee7e57ebef6a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -696,9 +696,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* add the VMA to the tree */ @@ -760,9 +762,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* remove from the MM's tree and list */ @@ -2052,6 +2056,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; down_write(&nommu_region_sem); + mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, @@ -2059,6 +2064,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return -ETXTBSY; /* not quite true, but near enough */ } @@ -2086,6 +2092,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, } } + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return 0; } -- cgit v1.2.2 From b94cfaf6685d691dc3fab023cf32f65e9b7be09c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Feb 2012 13:51:00 +0000 Subject: NOMMU: Don't need to clear vm_mm when deleting a VMA Don't clear vm_mm in a deleted VMA as it's unnecessary and might conceivably break the filesystem or driver VMA close routine. Reported-by: Al Viro Signed-off-by: David Howells Acked-by: Al Viro cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index ee7e57ebef6a..f59e170fceb4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -779,8 +779,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_next) vma->vm_next->vm_prev = vma->vm_prev; - - vma->vm_mm = NULL; } /* -- cgit v1.2.2 From 847854f5988a04fe7e02d2fdd4fa0df9f96360fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Feb 2012 05:56:21 +0900 Subject: memblock: Fix size aligning of memblock_alloc_base_nid() memblock allocator aligns @size to @align to reduce the amount of fragmentation. Commit: 7bd0b0f0da ("memblock: Reimplement memblock allocation using reverse free area iterator") Broke it by incorrectly relocating @size aligning to memblock_find_in_range_node(). As the aligned size is not propagated back to memblock_alloc_base_nid(), the actually reserved size isn't aligned. While this increases memory use for memblock reserved array, this shouldn't cause any critical failure; however, it seems that the size aligning was hiding a use-beyond-allocation bug in sparc64 and losing the aligning causes boot failure. The underlying problem is currently being debugged but this is a proper fix in itself, it's already pretty late in -rc cycle for boot failures and reverting the change for debugging isn't difficult. Restore the size aligning moving it to memblock_alloc_base_nid(). Reported-by: Meelis Roos Signed-off-by: Tejun Heo Cc: David S. Miller Cc: Grant Likely Cc: Rob Herring Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120228205621.GC3252@dhcp-172-17-108-109.mtv.corp.google.com Signed-off-by: Ingo Molnar LKML-Reference: --- mm/memblock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 77b5f227e1d8..99f285599501 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -99,9 +99,6 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, phys_addr_t this_start, this_end, cand; u64 i; - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); - /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; @@ -731,6 +728,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + found = memblock_find_in_range_node(0, max_addr, size, align, nid); if (found && !memblock_reserve(found, size)) return found; -- cgit v1.2.2 From 835ee7978cb47de94cf70232a694f19295d2993f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Mar 2012 06:39:47 +0000 Subject: VM_GROWS{UP,DOWN} shouldn't be set on shmem VMAs Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3f758c7f4c81..22e1a0b2f70c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1293,6 +1293,8 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { + if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) + goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; -- cgit v1.2.2 From cd2934a3b3057eb048f8b4fb82e941d24a043207 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Mar 2012 06:40:29 +0000 Subject: flush_tlb_range() needs ->page_table_lock when ->mmap_sem is not held All other callers already hold either ->mmap_sem (exclusive) or ->page_table_lock. And we need it because some page table flushing instanced do work explicitly with ge tables. See e.g. arch/powerpc/mm/tlb_hash32.c, flush_tlb_range() and flush_range() in there. The same goes for uml, with a lot more extensive playing with page tables. Almost all callers are actually fine - flush_tlb_range() may have no need to bother playing with page tables, but it can do so safely; again, this caller is the sole exception - everything else either has exclusive ->mmap_sem on the mm in question, or mm->page_table_lock is held. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f34bd8dda34..a876871f6be5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2277,8 +2277,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, set_page_dirty(page); list_add(&page->lru, &page_list); } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { page_remove_rmap(page); -- cgit v1.2.2 From 9ce70c0240d01309b34712f87eda4fbfba3c3764 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 5 Mar 2012 14:59:16 -0800 Subject: memcg: fix deadlock by inverting lrucare nesting We have forgotten the rules of lock nesting: the irq-safe ones must be taken inside the non-irq-safe ones, otherwise we are open to deadlock: CPU0 CPU1 ---- ---- lock(&(&pc->lock)->rlock); local_irq_disable(); lock(&(&zone->lru_lock)->rlock); lock(&(&pc->lock)->rlock); lock(&(&zone->lru_lock)->rlock); To check a different locking issue, I happened to add a spin_lock to memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above). So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under lock_page_cgroup() in the lrucare case. The original was using spin_lock_irqsave, but we'd be in more trouble if it were ever called at interrupt time: unconditional _irq is enough. And ClearPageLRU before del from lru, SetPageLRU before add to lru: no strong reason, but that is the ordering used consistently elsewhere. Fixes 36b62ad539498d00c2d280a151a ("memcg: simplify corner case handling of LRU"). Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 72 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 228d6461c12a..1097d8098f8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2408,8 +2408,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype) + enum charge_type ctype, + bool lrucare) { + struct zone *uninitialized_var(zone); + bool was_on_lru = false; + lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); @@ -2420,6 +2424,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. */ + + /* + * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page + * may already be on some other mem_cgroup's LRU. Take care of it. + */ + if (lrucare) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ClearPageLRU(page); + del_page_from_lru_list(zone, page, page_lru(page)); + was_on_lru = true; + } + } + pc->mem_cgroup = memcg; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2443,9 +2462,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, break; } + if (lrucare) { + if (was_on_lru) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_lru_list(zone, page, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); + } + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); - WARN_ON_ONCE(PageLRU(page)); + /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2643,7 +2671,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); return 0; } @@ -2663,35 +2691,6 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); -static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *zone = page_zone(page); - unsigned long flags; - bool removed = false; - - /* - * In some case, SwapCache, FUSE(splice_buf->radixtree), the page - * is already on LRU. It means the page may on some other page_cgroup's - * LRU. Take care of it. - */ - spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(page)) { - del_page_from_lru_list(zone, page, page_lru(page)); - ClearPageLRU(page); - removed = true; - } - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - if (removed) { - add_page_to_lru_list(zone, page, page_lru(page)); - SetPageLRU(page); - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - return; -} - int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -2769,13 +2768,16 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { + struct page_cgroup *pc; + if (mem_cgroup_disabled()) return; if (!memcg) return; cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); + pc = lookup_page_cgroup(page); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -3248,7 +3250,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); return ret; } @@ -3332,7 +3334,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ - __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); } #ifdef CONFIG_DEBUG_VM -- cgit v1.2.2 From 7512102cf64d36e3c7444480273623c7aab3563f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 5 Mar 2012 14:59:18 -0800 Subject: memcg: fix GPF when cgroup removal races with last exit When moving tasks from old memcg (with move_charge_at_immigrate on new memcg), followed by removal of old memcg, hit General Protection Fault in mem_cgroup_lru_del_list() (called from release_pages called from free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from exit_mmap from mmput from exit_mm from do_exit). Somewhat reproducible, takes a few hours: the old struct mem_cgroup has been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is still trying to update its stats, and take page off lru before freeing. A task, or a charge, or a page on lru: each secures a memcg against removal. In this case, the last task has been moved out of the old memcg, and it is exiting: anonymous pages are uncharged one by one from the memcg, as they are zapped from its pagetables, so the charge gets down to 0; but the pages themselves are queued in an mmu_gather for freeing. Most of those pages will be on lru (and force_empty is careful to lru_add_drain_all, to add pages from pagevec to lru first), but not necessarily all: perhaps some have been isolated for page reclaim, perhaps some isolated for other reasons. So, force_empty may find no task, no charge and no page on lru, and let the removal proceed. There would still be no problem if these pages were immediately freed; but typically (and the put_page_testzero protocol demands it) they have to be added back to lru before they are found freeable, then removed from lru and freed. We don't see the issue when adding, because the mem_cgroup_iter() loops keep their own reference to the memcg being scanned; but when it comes to mem_cgroup_lru_del_list(). I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and PageCgroupUsed flags were used (like a trick with mirrors) to deflect view of pc->mem_cgroup to the stable root_mem_cgroup when neither set. 38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully removed those convolutions, but left this General Protection Fault. But it's surprisingly easy to restore the old behaviour: just check PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec to add), and reset pc to root_mem_cgroup if page is uncharged. A risky change? just going back to how it worked before; testing, and an audit of uses of pc->mem_cgroup, show no problem. And there's a nice bonus: with mem_cgroup_lru_add_list() itself making sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg: clear pc->mem_cgroup if necessary"). Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c is not strictly necessary: the lru_lock there, with RCU before memcg structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe without that; but it seems cleaner to rely on one dependency less. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 11 ----------- mm/memcontrol.c | 30 +++++++++++++----------------- mm/migrate.c | 2 -- mm/swap.c | 8 +++++--- mm/swap_state.c | 10 ---------- 5 files changed, 18 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 1925ffbfb27f..310544a379ae 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1572,16 +1571,6 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. - */ - mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1097d8098f8c..d0e57a3cda18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1042,6 +1042,19 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; + + /* + * Surreptitiously switch any uncharged page to root: + * an uncharged page off lru does nothing to secure + * its former mem_cgroup from sudden removal. + * + * Our caller holds lru_lock, and PageCgroupUsed is updated + * under page_cgroup lock: between them, they make all uses + * of pc->mem_cgroup safe. + */ + if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + pc->mem_cgroup = memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); @@ -3029,23 +3042,6 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } -/* - * A function for resetting pc->mem_cgroup for newly allocated pages. - * This function should be called if the newpage will be added to LRU - * before start accounting. - */ -void mem_cgroup_reset_owner(struct page *newpage) -{ - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(newpage); - VM_BUG_ON(PageCgroupUsed(pc)); - pc->mem_cgroup = root_mem_cgroup; -} - #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. diff --git a/mm/migrate.c b/mm/migrate.c index df141f60289e..1503b6b54ecb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - mem_cgroup_reset_owner(newpage); - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; diff --git a/mm/swap.c b/mm/swap.c index fff1ff7fb9ad..14380e9fbe33 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) { - int active; + int uninitialized_var(active); enum lru_list lru; const int file = 0; @@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone, active = 0; lru = LRU_INACTIVE_ANON; } - update_page_reclaim_stat(zone, page_tail, file, active); } else { SetPageUnevictable(page_tail); lru = LRU_UNEVICTABLE; @@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone, list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(zone, page_tail, file, active); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); + update_page_reclaim_stat(zone, page, file, active); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 470038a91873..ea6b32d61873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. - */ - mem_cgroup_reset_owner(new_page); } /* -- cgit v1.2.2 From 1c641e84719429bbfe62a95ed3545ee7fe24408f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 5 Mar 2012 14:59:20 -0800 Subject: mm: thp: fix BUG on mm->nr_ptes Dave Jones reports a few Fedora users hitting the BUG_ON(mm->nr_ptes...) in exit_mmap() recently. Quoting Hugh's discovery and explanation of the SMP race condition: "mm->nr_ptes had unusual locking: down_read mmap_sem plus page_table_lock when incrementing, down_write mmap_sem (or mm_users 0) when decrementing; whereas THP is careful to increment and decrement it under page_table_lock. Now most of those paths in THP also hold mmap_sem for read or write (with appropriate checks on mm_users), but two do not: when split_huge_page() is called by hwpoison_user_mappings(), and when called by add_to_swap(). It's conceivable that the latter case is responsible for the exit_mmap() BUG_ON mm->nr_ptes that has been reported on Fedora." The simplest way to fix it without having to alter the locking is to make split_huge_page() a noop in nr_ptes terms, so by counting the preallocated pagetables that exists for every mapped hugepage. It was an arbitrary choice not to count them and either way is not wrong or right, because they are not used but they're still allocated. Reported-by: Dave Jones Reported-by: Hugh Dickins Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: David Rientjes Cc: Josh Boyer Cc: [3.0.x, 3.1.x, 3.2.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 91d3efb25d15..8f7fc394f636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm->nr_ptes++; spin_unlock(&mm->page_table_lock); } @@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); prepare_pmd_huge_pte(pgtable, dst_mm); + dst_mm->nr_ptes++; ret = 0; out_unlock: @@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); @@ -1047,6 +1048,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); pte_free(tlb->mm, pgtable); @@ -1375,7 +1377,6 @@ static int __split_huge_page_map(struct page *page, pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1988,7 +1989,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; spin_unlock(&mm->page_table_lock); #ifndef CONFIG_NUMA -- cgit v1.2.2 From e6ca7b89dc76abf77c80887fed54e0a60c87c0a8 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 5 Mar 2012 14:59:20 -0800 Subject: memcg: fix mapcount check in move charge code for anonymous page Currently the charge on shared anonyous pages is supposed not to moved in task migration. To implement this, we need to check that mapcount > 1, instread of > 2. So this patch fixes it. Signed-off-by: Naoya Horiguchi Reviewed-by: Daisuke Nishimura Cc: Andrea Arcangeli Cc: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d0e57a3cda18..5585dc3d3646 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5075,7 +5075,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return NULL; if (PageAnon(page)) { /* we don't move shared anon */ - if (!move_anon() || page_mapcount(page) > 2) + if (!move_anon() || page_mapcount(page) > 1) return NULL; } else if (!move_file()) /* we ignore mapcount for file pages */ -- cgit v1.2.2 From c09ff089aa62380ad904ea785bd713c56720270e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 5 Mar 2012 20:52:55 -0800 Subject: page_cgroup: fix horrid swap accounting regression Why is memcg's swap accounting so broken? Insane counts, wrong ownership, unfreeable structures, which later get freed and then accessed after free. Turns out to be a tiny a little 3.3-rc1 regression in 9fb4b7cc0724 "page_cgroup: add helper function to get swap_cgroup": the helper function (actually named lookup_swap_cgroup()) returns an address using void* arithmetic, but the structure in question is a short. Signed-off-by: Hugh Dickins Reviewed-by: Bob Liu Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index de1616aa9b1e..1ccbd714059c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; struct page *mappage; + struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; mappage = ctrl->map[offset / SC_PER_PAGE]; - return page_address(mappage) + offset % SC_PER_PAGE; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; } /** -- cgit v1.2.2 From ce8fea7aa4ad9e3b40999a08622ef27c77159659 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Mar 2012 12:28:52 -0800 Subject: mmap: EINVAL not ENOMEM when rejecting VM_GROWS Currently error is -ENOMEM when rejecting VM_GROWSDOWN|VM_GROWSUP from shared anonymous: hoist the file case's -EINVAL up for both. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 22e1a0b2f70c..09ce2cae07ca 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1266,8 +1266,9 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); + error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ + if (file) { - error = -EINVAL; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) goto free_vma; if (vm_flags & VM_DENYWRITE) { -- cgit v1.2.2 From 83cd904d271ba960c53f3adbb037f3486518f1e6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 4 Mar 2012 19:52:03 -0500 Subject: mm: fix find_vma_prev Commit 6bd4837de96e ("mm: simplify find_vma_prev()") broke memory management on PA-RISC. After application of the patch, programs that allocate big arrays on the stack crash with segfault, for example, this will crash if compiled without optimization: int main() { char array[200000]; array[199999] = 0; return 0; } The reason is that PA-RISC has up-growing stack and the stack is usually the last memory area. In the above example, a page fault happens above the stack. Previously, if we passed too high address to find_vma_prev, it returned NULL and stored the last VMA in *pprev. After "simplify find_vma_prev" change, it stores NULL in *pprev. Consequently, the stack area is not found and it is not expanded, as it used to be before the change. This patch restores the old behavior and makes it return the last VMA in *pprev if the requested address is higher than address of any other VMA. Signed-off-by: Mikulas Patocka Acked-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 09ce2cae07ca..da15a79b1441 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1608,7 +1608,6 @@ EXPORT_SYMBOL(find_vma); /* * Same as find_vma, but also return a pointer to the previous VMA in *pprev. - * Note: pprev is set to NULL when return value is NULL. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, @@ -1617,7 +1616,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; vma = find_vma(mm, addr); - *pprev = vma ? vma->vm_prev : NULL; + if (vma) { + *pprev = vma->vm_prev; + } else { + struct rb_node *rb_node = mm->mm_rb.rb_node; + *pprev = NULL; + while (rb_node) { + *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); + rb_node = rb_node->rb_right; + } + } return vma; } -- cgit v1.2.2 From 097d59106a8e4b42d07c9892fdd7790f1659c6ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 18:23:36 -0800 Subject: vm: avoid using find_vma_prev() unnecessarily Several users of "find_vma_prev()" were not in fact interested in the previous vma if there was no primary vma to be found either. And in those cases, we're much better off just using the regular "find_vma()", and then "prev" can be looked up by just checking vma->vm_prev. The find_vma_prev() semantics are fairly subtle (see Mikulas' recent commit 83cd904d271b: "mm: fix find_vma_prev"), and the whole "return prev by reference" means that it generates worse code too. Thus this "let's avoid using this inconvenient and clearly too subtle interface when we don't really have to" patch. Cc: Mikulas Patocka Cc: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- mm/mlock.c | 3 ++- mm/mprotect.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 06b145fb64ab..47296fee23db 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmstart; unsigned long vmend; - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma || vma->vm_start > start) return -EFAULT; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65d..ef726e8aa8e9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) return -EINVAL; if (end == start) return 0; - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); if (!vma || vma->vm_start > start) return -ENOMEM; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..f437d054c3bf 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, down_write(¤t->mm->mmap_sem); - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) goto out; + prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; -- cgit v1.2.2 From be22aece684f5a700e6247b9861c3759d5798a3c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 9 Mar 2012 13:37:32 -0800 Subject: memcg: revert fix to mapcount check for this release Respectfully revert commit e6ca7b89dc76 "memcg: fix mapcount check in move charge code for anonymous page" for the 3.3 release, so that it behaves exactly like releases 2.6.35 through 3.2 in this respect. Horiguchi-san's commit is correct in itself, 1 makes much more sense than 2 in that check; but it does not go far enough - swapcount should be considered too - if we really want such a check at all. We appear to have reached agreement now, and expect that 3.4 will remove the mapcount check, but had better not make 3.3 different. Signed-off-by: Hugh Dickins Reviewed-by: Naoya Horiguchi Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5585dc3d3646..d0e57a3cda18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5075,7 +5075,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return NULL; if (PageAnon(page)) { /* we don't move shared anon */ - if (!move_anon() || page_mapcount(page) > 1) + if (!move_anon() || page_mapcount(page) > 2) return NULL; } else if (!move_file()) /* we ignore mapcount for file pages */ -- cgit v1.2.2