From 9b46333406b9cb3397ab538485a4d57c316af0ff Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 28 Oct 2008 19:22:34 +1100 Subject: vmap: cope with vm_unmap_aliases before vmalloc_init() Xen can end up calling vm_unmap_aliases() before vmalloc_init() has been called. In this case its safe to make it a simple no-op. Signed-off-by: Jeremy Fitzhardinge Cc: Linux Memory Management List Cc: Nick Piggin Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 66fad3fc02b1..ba6b0f5f7fac 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -592,6 +592,8 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +static bool vmap_initialized __read_mostly = false; + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -828,6 +830,9 @@ void vm_unmap_aliases(void) int cpu; int flush = 0; + if (unlikely(!vmap_initialized)) + return; + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -942,6 +947,8 @@ void __init vmalloc_init(void) INIT_LIST_HEAD(&vbq->dirty); vbq->nr_dirty = 0; } + + vmap_initialized = true; } void unmap_kernel_range(unsigned long addr, unsigned long size) -- cgit v1.2.2 From 1c1271850494f06b63ae6b485e2e1b9c27ffb2d1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 12 Nov 2008 01:24:41 +0100 Subject: parisc: fix find_extend_vma() breakage The STACK_GROWSUP case of stack expansion was missing a test for 'prev', which got removed by commit cb8f488c33539f096580e202f5438a809195008f ("mmap.c: deinline a few functions") by mistake. I found my original email in "sent" folder. The patch in that mail does NOT remove !prev. That change had beed added by someone else. Ok, I think we are not much interested in who did it, let's fix it for good. [ "It looks like this was caused by me fixing rejects. That was the fancy include-lots-of-context-so-it-wont-apply patch." - akpm ] Reported-and-bisected-by: Helge Deller Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Jiri Kosina Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index de14ac21e5b5..d4855a682ab6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1704,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) -- cgit v1.2.2 From 7526674de0c921e7f1e9b6f71a1f9d832557b554 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Wed, 12 Nov 2008 13:24:56 -0800 Subject: hugetlb: make unmap_ref_private multi-size-aware Oops. Part of the hugetlb private reservation code was not fully converted to use hstates. When a huge page must be unmapped from VMAs due to a failed COW, HPAGE_SIZE is used in the call to unmap_hugepage_range() regardless of the page size being used. This works if the VMA is using the default huge page size. Otherwise we might unmap too much, too little, or trigger a BUG_ON. Rare but serious -- fix it. Signed-off-by: Adam Litke Cc: Jon Tollefson Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d143ab67be44..6058b53dcb89 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1796,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { + struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; struct prio_tree_iter iter; @@ -1805,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ - address = address & huge_page_mask(hstate_vma(vma)); + address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); @@ -1824,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, - address, address + HPAGE_SIZE, + address, address + huge_page_size(h), page); } -- cgit v1.2.2 From e33c3b5e172e2e45456f42fba47227d48745543f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Nov 2008 13:25:37 -0800 Subject: cpusets: update mems allowed in page allocator If all allowable memory is unreclaimable, it is possible to loop forever in the page allocator for ~__GFP_NORETRY allocations. During this time, it is also possible for a task's cpuset to expand its set of allowable nodes so that it now includes free memory. The cached copy of this set, current->mems_allowed, is stale, however, since there has not been a subsequent call to cpuset_update_task_memory_state(). The cached copy of the set of allowable nodes is now updated in the page allocator's slow path so the additional memory is available to get_page_from_freelist(). [akpm@linux-foundation.org: add comment] Signed-off-by: David Rientjes Cc: Paul Menage Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54069e64e3a8..d8ac01474563 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1561,6 +1561,10 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.2 From 8891d6da17db0f9bb507d3a017f130b9970c3087 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 12 Nov 2008 13:26:53 -0800 Subject: mm: remove lru_add_drain_all() from the munlock path lockdep warns about following message at boot time on one of my test machine. Then, schedule_on_each_cpu() sholdn't be called when the task have mmap_sem. Actually, lru_add_drain_all() exist to prevent the unevictalble pages stay on reclaimable lru list. but currenct unevictable code can rescue unevictable pages although it stay on reclaimable list. So removing is better. In addition, this patch add lru_add_drain_all() to sys_mlock() and sys_mlockall(). it isn't must. but it reduce the failure of moving to unevictable list. its failure can rescue in vmscan later. but reducing is better. Note, if above rescuing happend, the Mlocked and the Unevictable field mismatching happend in /proc/meminfo. but it doesn't cause any real trouble. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.28-rc2-mm1 #2 ------------------------------------------------------- lvm/1103 is trying to acquire lock: (&cpu_hotplug.lock){--..}, at: [] get_online_cpus+0x29/0x50 but task is already holding lock: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){----}: [] check_noncircular+0x82/0x110 [] might_fault+0x4a/0xa0 [] validate_chain+0xb11/0x1070 [] might_fault+0x4a/0xa0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab mmap_sem [] might_fault+0x4a/0xa0 [] might_fault+0x7b/0xa0 [] might_fault+0x4a/0xa0 [] copy_to_user+0x30/0x60 [] filldir+0x7c/0xd0 [] sysfs_readdir+0x11a/0x1f0 (*) grab sysfs_mutex [] filldir+0x0/0xd0 [] filldir+0x0/0xd0 [] vfs_readdir+0x86/0xa0 (*) grab i_mutex [] sys_getdents+0x6b/0xc0 [] syscall_call+0x7/0xb [] 0xffffffff -> #2 (sysfs_mutex){--..}: [] check_noncircular+0x82/0x110 [] sysfs_addrm_start+0x2c/0xc0 [] validate_chain+0xb11/0x1070 [] sysfs_addrm_start+0x2c/0xc0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab sysfs_mutex [] sysfs_addrm_start+0x2c/0xc0 [] mutex_lock_nested+0xa5/0x2f0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] create_dir+0x3f/0x90 [] sysfs_create_dir+0x29/0x50 [] _spin_unlock+0x25/0x40 [] kobject_add_internal+0xcd/0x1a0 [] kobject_set_name_vargs+0x3a/0x50 [] kobject_init_and_add+0x2d/0x40 [] sysfs_slab_add+0xd2/0x180 [] sysfs_add_func+0x0/0x70 [] sysfs_add_func+0x5c/0x70 (*) grab slub_lock [] run_workqueue+0x172/0x200 [] run_workqueue+0x10f/0x200 [] worker_thread+0x0/0xf0 [] worker_thread+0x9c/0xf0 [] autoremove_wake_function+0x0/0x50 [] worker_thread+0x0/0xf0 [] kthread+0x42/0x70 [] kthread+0x0/0x70 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #1 (slub_lock){----}: [] check_noncircular+0xd/0x110 [] slab_cpuup_callback+0x11f/0x1d0 [] validate_chain+0xb11/0x1070 [] slab_cpuup_callback+0x11f/0x1d0 [] mark_lock+0x35d/0xd00 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] slab_cpuup_callback+0x11f/0x1d0 [] down_read+0x43/0x80 [] slab_cpuup_callback+0x11f/0x1d0 (*) grab slub_lock [] slab_cpuup_callback+0x11f/0x1d0 [] notifier_call_chain+0x3c/0x70 [] _cpu_up+0x84/0x110 [] cpu_up+0x4b/0x70 (*) grab cpu_hotplug.lock [] kernel_init+0x0/0x170 [] kernel_init+0xb5/0x170 [] kernel_init+0x0/0x170 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #0 (&cpu_hotplug.lock){--..}: [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 (*) grab cpu_hotplug.lock [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 (*) grab mmap_sem [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb [] 0xffffffff other info that might help us debug this: 1 lock held by lvm/1103: #0: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 stack backtrace: Pid: 1103, comm: lvm Not tainted 2.6.28-rc2-mm1 #2 Call Trace: [] print_circular_bug_tail+0x7c/0xd0 [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb Signed-off-by: KOSAKI Motohiro Tested-by: Kamalesh Babulal Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 008ea70b7afa..a6da2aee940a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page) putback_lru_page(page); } else { /* - * Page not on the LRU yet. Flush all pagevecs and retry. + * We lost the race. the page already moved to evictable list. */ - lru_add_drain_all(); - if (!isolate_lru_page(page)) - putback_lru_page(page); - else if (PageUnevictable(page)) + if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); - } } @@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= GUP_FLAGS_WRITE; - lru_add_drain_all(); /* push cached pages to LRU */ - while (nr_pages > 0) { int i; @@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, ret = 0; } - lru_add_drain_all(); /* to update stats */ - return ret; /* count entire vma as locked_vm */ } @@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) if (!can_do_mlock()) return -EPERM; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags) if (!can_do_mlock()) goto out; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; -- cgit v1.2.2 From 33c5d3d64589c5d379db5a5615735f6d08438369 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 12 Nov 2008 13:27:01 -0800 Subject: memcg: bugfix for memory hotplug The start pfn calculation in page_cgroup's memory hotplug notifier chain is wrong. Tested-by: Badari Pulavarty Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f59d797dc5a9..1223d927904d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -165,7 +165,7 @@ int online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { @@ -188,7 +188,7 @@ int offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) -- cgit v1.2.2 From 748f1a2ed7a68e15b28a1da3559afbebba121772 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 14 Nov 2008 16:25:01 +0900 Subject: mm: remove unevictable's show_page_path Hugh Dickins reported show_page_path() is buggy and unsafe because - lack dput() against d_find_alias() - don't concern vma->vm_mm->owner == NULL - lack lock_page() it was only for debugging, so rather than trying to fix it, just remove it now. Reported-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro CC: Lee Schermerhorn CC: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b5860294bb6..c141b3e78071 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2368,39 +2368,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } -static void show_page_path(struct page *page) -{ - char buf[256]; - if (page_is_file_cache(page)) { - struct address_space *mapping = page->mapping; - struct dentry *dentry; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - spin_lock(&mapping->i_mmap_lock); - dentry = d_find_alias(mapping->host); - printk(KERN_INFO "rescued: %s %lu\n", - dentry_path(dentry, buf, 256), pgoff); - spin_unlock(&mapping->i_mmap_lock); - } else { -#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return; - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - printk(KERN_INFO "rescued: anon %s\n", - vma->vm_mm->owner->comm); - break; - } - page_unlock_anon_vma(anon_vma); -#endif - } -} - - /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -2421,8 +2388,6 @@ retry: if (page_evictable(page, NULL)) { enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); - show_page_path(page); - __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); __inc_zone_state(zone, NR_INACTIVE_ANON + l); -- cgit v1.2.2 From 72eb8c6747b49e41fd2b042510f03ac7c13426fc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 17 Nov 2008 00:30:57 +0100 Subject: unitialized return value in mm/mlock.c: __mlock_vma_pages_range() Fix an unitialized return value when compiling on parisc (with CONFIG_UNEVICTABLE_LRU=y): mm/mlock.c: In function `__mlock_vma_pages_range': mm/mlock.c:165: warning: `ret' might be used uninitialized in this function Signed-off-by: Helge Deller [ It isn't ever really used uninitialized, since no caller should ever call this function with an empty range. But the compiler is correct that from a local analysis standpoint that is impossible to see, and fixing the warning is appropriate. ] Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index a6da2aee940a..1ada366570cb 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret; + int ret = 0; int gup_flags = 0; VM_BUG_ON(start & ~PAGE_MASK); -- cgit v1.2.2