From 04c3496152394d17e3bc2316f9731ee3e8a026bc Mon Sep 17 00:00:00 2001 From: "Steven J. Magnani" Date: Wed, 24 Nov 2010 12:56:54 -0800 Subject: nommu: yield CPU while disposing VM Depending on processor speed, page size, and the amount of memory a process is allowed to amass, cleanup of a large VM may freeze the system for many seconds. This can result in a watchdog timeout. Make sure other tasks receive some service when cleaning up large VMs. Signed-off-by: Steven J. Magnani Cc: Greg Ungerer Reviewed-by: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 3613517c7592..27a9ac588516 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1717,6 +1717,7 @@ void exit_mmap(struct mm_struct *mm) mm->mmap = vma->vm_next; delete_vma_from_mm(vma); delete_vma(mm, vma); + cond_resched(); } kleave(""); -- cgit v1.2.2 From 112bc2e120a94a511858918d6866a4978f9c500e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Nov 2010 12:56:58 -0800 Subject: memcg: fix false positive VM_BUG on non-SMP Fix this: kernel BUG at mm/memcontrol.c:2155! invalid opcode: 0000 [#1] last sysfs file: Pid: 18, comm: sh Not tainted 2.6.37-rc3 #3 /Bochs EIP: 0060:[] EFLAGS: 00000246 CPU: 0 EIP is at mem_cgroup_move_account+0xe2/0xf0 EAX: 00000004 EBX: c6f931d4 ECX: c681c300 EDX: c681c000 ESI: c681c300 EDI: ffffffea EBP: c681c000 ESP: c46f3e30 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Process sh (pid: 18, ti=c46f2000 task=c6826e60 task.ti=c46f2000) Stack: 00000155 c681c000 0805f000 c46ee180 c46f3e5c c7058820 c1074d37 00000000 08060000 c46db9a0 c46ec080 c7058820 0805f000 08060000 c46f3e98 c1074c50 c106c75e c46f3e98 c46ec080 08060000 0805ffff c46db9a0 c46f3e98 c46e0340 Call Trace: [] ? mem_cgroup_move_charge_pte_range+0xe7/0x130 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? walk_page_range+0xee/0x1d0 [] ? mem_cgroup_move_task+0x66/0x90 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? mem_cgroup_move_task+0x0/0x90 [] ? cgroup_attach_task+0x136/0x200 [] ? cgroup_tasks_write+0x48/0xc0 [] ? cgroup_file_write+0xde/0x220 [] ? do_page_fault+0x17d/0x3f0 [] ? alloc_fd+0x2d/0xd0 [] ? cgroup_file_write+0x0/0x220 [] ? vfs_write+0x92/0xc0 [] ? sys_write+0x41/0x70 [] ? syscall_call+0x7/0xb Code: 03 00 74 09 8b 44 24 04 e8 1c f1 ff ff 89 73 04 8d 86 b0 00 00 00 b9 01 00 00 00 89 da 31 ff e8 65 f5 ff ff e9 4d ff ff ff 0f 0b <0f> 0b 0f 0b 0f 0b 90 8d b4 26 00 00 00 00 83 ec 10 8b 0d f4 e3 EIP: [] mem_cgroup_move_account+0xe2/0xf0 SS:ESP 0068:c46f3e30 ---[ end trace 7daa1582159b6532 ]--- lock_page_cgroup and unlock_page_cgroup are implemented using bit_spinlock. bit_spinlock doesn't touch the bit if we are on non-SMP machine, so we can't use the bit to check whether the lock was taken. Let's introduce is_page_cgroup_locked based on bit_spin_is_locked instead of PageCgroupLocked to fix it. [akpm@linux-foundation.org: s/is_page_cgroup_locked/page_is_cgroup_locked/] Signed-off-by: Kirill A. Shutemov Reviewed-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea07ff7..62d1880f6992 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2152,7 +2152,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, { VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!page_is_cgroup_locked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); -- cgit v1.2.2 From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 24 Nov 2010 12:57:06 -0800 Subject: memcg: avoid deadlock between move charge and try_charge() __mem_cgroup_try_charge() can be called under down_write(&mmap_sem)(e.g. mlock does it). This means it can cause deadlock if it races with move charge: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() To avoid this deadlock, we do all the move charge works (both can_attach() and attach()) under one mmap_sem section. And after this patch, we set/clear mc.moving_task outside mc.lock, because we use the lock only to check mc.from/to. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 62d1880f6992..26218df8d19d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -278,13 +278,14 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } - up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) mc.moved_swap = 0; } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); @@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; @@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } - mmput(mm); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); } return ret; } @@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } - up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - struct mm_struct *mm; - - if (!mc.to) + if (!mc.mm) /* no need to move charge */ return; - mm = get_task_mm(p); - if (mm) { - mem_cgroup_move_charge(mm); - mmput(mm); - } + mem_cgroup_move_charge(mc.mm); mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit v1.2.2 From a42c390cfa0c2612459d7226ba11612847ca3a64 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Nov 2010 12:57:08 -0800 Subject: cgroups: make swap accounting default behavior configurable Swap accounting can be configured by CONFIG_CGROUP_MEM_RES_CTLR_SWAP configuration option and then it is turned on by default. There is a boot option (noswapaccount) which can disable this feature. This makes it hard for distributors to enable the configuration option as this feature leads to a bigger memory consumption and this is a no-go for general purpose distribution kernel. On the other hand swap accounting may be very usuful for some workloads. This patch adds a new configuration option which controls the default behavior (CGROUP_MEM_RES_CTLR_SWAP_ENABLED). If the option is selected then the feature is turned on by default. It also adds a new boot parameter swapaccount[=1|0] which enhances the original noswapaccount parameter semantic by means of enable/disable logic (defaults to 1 if no value is provided to be still consistent with noswapaccount). The default behavior is unchanged (if CONFIG_CGROUP_MEM_RES_CTLR_SWAP is enabled then CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is enabled as well) Signed-off-by: Michal Hocko Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26218df8d19d..7a22b4129211 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif @@ -4920,10 +4927,20 @@ struct cgroup_subsys mem_cgroup_subsys = { }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!s || !strcmp(s, "1")) + really_do_swap_account = 1; + else if (!strcmp(s, "0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + enable_swap_account("0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.2 From e9959f0f37160e1f5351af828cc981712b5066c1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 24 Nov 2010 12:57:09 -0800 Subject: mm/page_alloc.c: fix build_all_zonelist() where percpu_alloc() is wrongly called under stop_machine_run() During memory hotplug, build_allzonelists() may be called under stop_machine_run(). In this function, setup_zone_pageset() is called. But it's bug because it will do page allocation under stop_machine_run(). Here is a report from Alok Kataria. BUG: sleeping function called from invalid context at kernel/mutex.c:94 in_atomic(): 0, irqs_disabled(): 1, pid: 4, name: migration/0 Pid: 4, comm: migration/0 Not tainted 2.6.35.6-45.fc14.x86_64 #1 Call Trace: [] __might_sleep+0xeb/0xf0 [] mutex_lock+0x24/0x50 [] pcpu_alloc+0x6d/0x7ee [] ? load_balance+0xbe/0x60e [] ? rt_se_boosted+0x21/0x2f [] ? dequeue_rt_stack+0x18b/0x1ed [] __alloc_percpu+0x10/0x12 [] setup_zone_pageset+0x38/0xbe [] ? build_zonelists_node.clone.58+0x79/0x8c [] __build_all_zonelists+0x419/0x46c [] ? cpu_stopper_thread+0xb2/0x198 [] stop_machine_cpu_stop+0x8e/0xc5 [] ? stop_machine_cpu_stop+0x0/0xc5 [] cpu_stopper_thread+0x108/0x198 [] ? schedule+0x5b2/0x5cc [] ? cpu_stopper_thread+0x0/0x198 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 Built 5 zonelists in Node order, mobility grouping on. Total pages: 289456 Policy zone: Normal This patch tries to fix the issue by moving setup_zone_pageset() out from stop_machine_run(). It's obviously not necessary to be called under stop_machine_run(). [akpm@linux-foundation.org: remove unneeded local] Reported-by: Alok Kataria Signed-off-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Petr Vandrovec Cc: Pekka Enberg Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..e4092704c1a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3008,14 +3008,6 @@ static __init_refok int __build_all_zonelists(void *data) build_zonelist_cache(pgdat); } -#ifdef CONFIG_MEMORY_HOTPLUG - /* Setup real pagesets for the new zone */ - if (data) { - struct zone *zone = data; - setup_zone_pageset(zone); - } -#endif - /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -3064,7 +3056,11 @@ void build_all_zonelists(void *data) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, data, NULL); +#ifdef CONFIG_MEMORY_HOTPLUG + if (data) + setup_zone_pageset((struct zone *)data); +#endif + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.2 From 5f0af70a25593a9d53b87bc8d31902fb7cc63e40 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Nov 2010 12:57:10 -0800 Subject: mm: remove call to find_vma in pagewalk for non-hugetlbfs Commit d33b9f45 ("mm: hugetlb: fix hugepage memory leak in walk_page_range()") introduces a check if a vma is a hugetlbfs one and later in 5dc37642 ("mm hugetlb: add hugepage support to pagemap") it is moved under #ifdef CONFIG_HUGETLB_PAGE but a needless find_vma call is left behind and its result is not used anywhere else in the function. The side-effect of caching vma for @addr inside walk->mm is neither utilized in walk_page_range() nor in called functions. Signed-off-by: David Sterba Reviewed-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Andy Whitcroft Cc: David Rientjes Cc: Hugh Dickins Cc: Lee Schermerhorn Cc: Matt Mackall Acked-by: Mel Gorman Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8b1a2ce21ee5..38cc58b8b2b0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -139,7 +139,6 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; - struct vm_area_struct *vma; if (addr >= end) return err; @@ -149,15 +148,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { + struct vm_area_struct *uninitialized_var(vma); + next = pgd_addr_end(addr, end); +#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); -#ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; -- cgit v1.2.2 From 1f64d69c7ad2e48e697493e45590679f7a69b7b2 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 2 Dec 2010 14:31:12 -0800 Subject: mm/hugetlb.c: avoid double unlock_page() in hugetlb_fault() Have hugetlb_fault() call unlock_page(page) only if it had previously called lock_page(page). Setting CONFIG_DEBUG_VM=y and then running the libhugetlbfs test suite, resulted in the tripping of VM_BUG_ON(!PageLocked(page)) in unlock_page() having been called by hugetlb_fault() when page == pagecache_page. This patch remedied the problem. Signed-off-by: Dean Nelson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a3558589ab..85855240933d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2738,7 +2738,8 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - unlock_page(page); + if (page != pagecache_page) + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); -- cgit v1.2.2 From 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Mon Sep 17 00:00:00 2001 From: Zeng Zhaoming Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: mm/mempolicy.c: add rcu read lock to protect pid structure find_task_by_vpid() should be protected by rcu_read_lock(), to prevent free_pid() reclaiming pid. Signed-off-by: Zeng Zhaoming Cc: "Paul E. McKenney" Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4a57f135b76e..11ff260fb282 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1307,15 +1307,18 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out; /* Find the mm_struct */ + rcu_read_lock(); read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -EINVAL; if (!mm) -- cgit v1.2.2 From e172662d113ceb22db727a979bb35b9c02f703b5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: vmstat: fix dirty threshold ordering The nr_dirty_[background_]threshold fields are misplaced before the numa_* fields, and users will read strange values. This is the right order. Before patch, nr_dirty_background_threshold will read as 0 (the value from numa_miss). numa_hit 128501 numa_miss 0 numa_foreign 0 numa_interleave 7388 numa_local 128501 numa_other 0 nr_dirty_threshold 144291 nr_dirty_background_threshold 72145 Signed-off-by: Wu Fengguang Cc: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 42eac4d33216..8f62f17ee1c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -750,8 +750,6 @@ static const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", - "nr_dirty_threshold", - "nr_dirty_background_threshold", #ifdef CONFIG_NUMA "numa_hit", @@ -761,6 +759,8 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", -- cgit v1.2.2 From 64141da587241301ce8638cc945f8b67853156ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Dec 2010 14:31:18 -0800 Subject: vmalloc: eagerly clear ptes on vunmap On stock 2.6.37-rc4, running: # mount lilith:/export /mnt/lilith # find /mnt/lilith/ -type f -print0 | xargs -0 file crashes the machine fairly quickly under Xen. Often it results in oops messages, but the couple of times I tried just now, it just hung quietly and made Xen print some rude messages: (XEN) mm.c:2389:d80 Bad type (saw 7400000000000001 != exp 3000000000000000) for mfn 1d7058 (pfn 18fa7) (XEN) mm.c:964:d80 Attempt to create linear p.t. with write perms (XEN) mm.c:2389:d80 Bad type (saw 7400000000000010 != exp 1000000000000000) for mfn 1d2e04 (pfn 1d1fb) (XEN) mm.c:2965:d80 Error while pinning mfn 1d2e04 Which means the domain tried to map a pagetable page RW, which would allow it to map arbitrary memory, so Xen stopped it. This is because vm_unmap_ram() left some pages mapped in the vmalloc area after NFS had finished with them, and those pages got recycled as pagetable pages while still having these RW aliases. Removing those mappings immediately removes the Xen-visible aliases, and so it has no problem with those pages being reused as pagetable pages. Deferring the TLB flush doesn't upset Xen because it can flush the TLB itself as needed to maintain its invariants. When unmapping a region in the vmalloc space, clear the ptes immediately. There's no point in deferring this because there's no amortization benefit. The TLBs are left dirty, and they are flushed lazily to amortize the cost of the IPIs. This specific motivation for this patch is an oops-causing regression since 2.6.36 when using NFS under Xen, triggered by the NFS client's use of vm_map_ram() introduced in 56e4ebf877b60 ("NFS: readdir with vmapped pages") . XFS also uses vm_map_ram() and could cause similar problems. Signed-off-by: Jeremy Fitzhardinge Cc: Nick Piggin Cc: Bryan Schumaker Cc: Trond Myklebust Cc: Alex Elder Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3d66b3dc5cb..eb5cc7d00c5a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,8 +31,6 @@ #include #include -bool vmap_lazy_unmap __read_mostly = true; - /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) { unsigned int log; - if (!vmap_lazy_unmap) - return 0; - log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - unmap_vmap_area(va); list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; @@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) +static void free_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -622,6 +617,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) try_purge_vmap_area_lazy(); } +/* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + /* * Free and unmap a vmap area */ @@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area_noflush(vb->va); + free_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + spin_lock(&vb->lock); BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); @@ -988,7 +995,6 @@ void vm_unmap_aliases(void) s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); - vunmap_page_range(s, e); flush = 1; if (s < start) -- cgit v1.2.2 From 20d6c96b5f1cad5c5da4641945ec17a1d9a1afc8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:19 -0800 Subject: mem-hotplug: introduce {un}lock_memory_hotplug() Presently hwpoison is using lock_system_sleep() to prevent a race with memory hotplug. However lock_system_sleep() is a no-op if CONFIG_HIBERNATION=n. Therefore we need a new lock. Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: Kamezawa Hiroyuki Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++---- mm/memory_hotplug.c | 31 ++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff6..46ab2c044b0e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1230,11 +1231,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_system_sleep prevents a race with memory hotplug, - * because the isolation assumes there's only a single user. + * The lock_memory_hotplug prevents a race with memory hotplug. * This is a big hammer, a better would be nicer. */ - lock_system_sleep(); + lock_memory_hotplug(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1264,7 +1264,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ret = 1; } unset_migratetype_isolate(p); - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9260314a221e..2c6523af5473 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,23 @@ #include "internal.h" +DEFINE_MUTEX(mem_hotplug_mutex); + +void lock_memory_hotplug(void) +{ + mutex_lock(&mem_hotplug_mutex); + + /* for exclusive hibernation if CONFIG_HIBERNATION=y */ + lock_system_sleep(); +} + +void unlock_memory_hotplug(void) +{ + unlock_system_sleep(); + mutex_unlock(&mem_hotplug_mutex); +} + + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -493,7 +510,7 @@ int mem_online_node(int nid) pg_data_t *pgdat; int ret; - lock_system_sleep(); + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (pgdat) { ret = -ENOMEM; @@ -504,7 +521,7 @@ int mem_online_node(int nid) BUG_ON(ret); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } @@ -516,7 +533,7 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - lock_system_sleep(); + lock_memory_hotplug(); res = register_memory_resource(start, size); ret = -EEXIST; @@ -563,7 +580,7 @@ error: release_memory_resource(res); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -791,7 +808,7 @@ static int offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_system_sleep(); + lock_memory_hotplug(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -880,7 +897,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_system_sleep(); + unlock_memory_hotplug(); return 0; failed_removal: @@ -891,7 +908,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } -- cgit v1.2.2 From a0b0f58cdd32ab363a600a294ddaa90f0c32de8c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:20 -0800 Subject: ksm: annotate ksm_thread_mutex is no deadlock source commit 62b61f611e ("ksm: memory hotremove migration only") caused the following new lockdep warning. ======================================================= [ INFO: possible circular locking dependency detected ] ------------------------------------------------------- bash/1621 is trying to acquire lock: ((memory_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x69/0xc0 but task is already holding lock: (ksm_thread_mutex){+.+.+.}, at: [] ksm_memory_callback+0x3a/0xc0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (ksm_thread_mutex){+.+.+.}: [] lock_acquire+0xaa/0x140 [] __mutex_lock_common+0x44/0x3f0 [] mutex_lock_nested+0x48/0x60 [] ksm_memory_callback+0x3a/0xc0 [] notifier_call_chain+0x8c/0xe0 [] __blocking_notifier_call_chain+0x7e/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x1cc/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b -> #0 ((memory_chain).rwsem){.+.+.+}: [] __lock_acquire+0x155a/0x1600 [] lock_acquire+0xaa/0x140 [] down_read+0x51/0xa0 [] __blocking_notifier_call_chain+0x69/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x56e/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b But it's a false positive. Both memory_chain.rwsem and ksm_thread_mutex have an outer lock (mem_hotplug_mutex). So they cannot deadlock. Thus, This patch annotate ksm_thread_mutex is not deadlock source. [akpm@linux-foundation.org: update comment, from Hugh] Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 65ab5c7067d9..43bc893470b4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1724,8 +1724,13 @@ static int ksm_memory_callback(struct notifier_block *self, /* * Keep it very simple for now: just lock out ksmd and * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take ksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * ksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. */ - mutex_lock(&ksm_thread_mutex); + mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); break; case MEM_OFFLINE: -- cgit v1.2.2 From 37d57443d5d810c6ef49e93586b046e7d4774818 Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Wed, 1 Dec 2010 20:04:20 +0200 Subject: slub: Fix a crash during slabinfo -v Commit f7cb1933621bce66a77f690776a16fe3ebbc4d58 ("SLUB: Pass active and inactive redzone flags instead of boolean to debug functions") missed two instances of check_object(). This caused a lot of warnings during 'slabinfo -v' finally leading to a crash: BUG ext4_xattr: Freepointer corrupt ... BUG buffer_head: Freepointer corrupt ... BUG ext4_alloc_context: Freepointer corrupt ... ... BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] file_sb_list_del+0x1c/0x35 PGD 79d78067 PUD 79e67067 PMD 0 Oops: 0002 [#1] SMP last sysfs file: /sys/kernel/slab/:t-0000192/validate This patch fixes the problem by converting the two missed instances. Acked-by: Christoph Lameter Signed-off-by: Tero Roponen Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 981fb730aa04..bec0e355fbad 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3401,13 +3401,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } -- cgit v1.2.2 From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:57:45 +0100 Subject: PM / Hibernate: Fix memory corruption related to swap There is a problem that swap pages allocated before the creation of a hibernation image can be released and used for storing the contents of different memory pages while the image is being saved. Since the kernel stored in the image doesn't know of that, it causes memory corruption to occur after resume from hibernation, especially on systems with relatively small RAM that need to swap often. This issue can be addressed by keeping the GFP_IOFS bits clear in gfp_allowed_mask during the entire hibernation, including the saving of the image, until the system is finally turned off or the hibernation is aborted. Unfortunately, for this purpose it's necessary to rework the way in which the hibernate and suspend code manipulates gfp_allowed_mask. This change is based on an earlier patch from Hugh Dickins. Signed-off-by: Rafael J. Wysocki Reported-by: Ondrej Zary Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org --- mm/page_alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4092704c1a9..ff7e15872398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; * only be modified with pm_mutex held, unless the suspend/hibernate code is * guaranteed not to run in parallel with that modification). */ -void set_gfp_allowed_mask(gfp_t mask) + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask = mask; + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } } -gfp_t clear_gfp_allowed_mask(gfp_t mask) +void pm_restrict_gfp_mask(void) { - gfp_t ret = gfp_allowed_mask; - WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask &= ~mask; - return ret; + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; } #endif /* CONFIG_PM_SLEEP */ -- cgit v1.2.2