From 5bead2a0680687b9576d57c177988e8aa082b922 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Sat, 13 Sep 2008 02:33:19 -0700
Subject: mm: mark the correct zone as full when scanning zonelists

The iterator for_each_zone_zonelist() uses a struct zoneref *z cursor when
scanning zonelists to keep track of where in the zonelist it is.  The
zoneref that is returned corresponds to the next zone that is to be
scanned, not the current one.  It was intended to be treated as an opaque
list.

When the page allocator is scanning a zonelist, it marks elements in the
zonelist corresponding to zones that are temporarily full.  As the
zonelist is being updated, it uses the cursor here;

        if (NUMA_BUILD)
                zlc_mark_zone_full(zonelist, z);

This is intended to prevent rescanning in the near future but the zoneref
cursor does not correspond to the zone that has been found to be full.
This is an easy misunderstanding to make so this patch corrects the
problem by changing the zoneref cursor to be the current zone being
scanned instead of the next one.

Signed-off-by: Mel Gorman
Cc: Andy Whitcroft
Cc: KAMEZAWA Hiroyuki
Cc: [2.6.26.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmzone.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
 				(z->zone && !zref_in_nodemask(z, nodes)))
 			z++;
 
-	*zone = zonelist_zone(z++);
+	*zone = zonelist_zone(z);
 	return z;
 }
--
cgit v1.2.2


From 02b71b70129aaaa38f280af2aa5a767a4dec9107 Mon Sep 17 00:00:00 2001
From: Salman Qazi
Date: Thu, 11 Sep 2008 12:25:41 -0700
Subject: slub: fixed uninitialized counter in struct kmem_cache_node

Initialized total objects atomic for the node in init_kmem_cache_node.
The uninitialized value was ruining the stats in /proc/slabinfo.

Acked-by: Christoph Lameter
Signed-off-by: Salman Qazi
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index fb486d5540f8..0c83e6afe7b2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1932,6 +1932,7 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
 	INIT_LIST_HEAD(&n->partial);
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_set(&n->nr_slabs, 0);
+	atomic_long_set(&n->total_objects, 0);
 	INIT_LIST_HEAD(&n->full);
 #endif
 }
--
cgit v1.2.2


From db203d53d474aa068984e409d807628f5841da1b Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Mon, 22 Sep 2008 13:57:50 -0700
Subject: mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex

tiny-shmem calls do_truncate in shmem_file_setup.  do_truncate takes
i_mutex, and shmem_file_setup is called with mmap_sem held.  However
i_mutex nests outside mmap_sem.

Copy the code in shmem.c to avoid this problem.
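
The nesting rule being violated is the classic ABBA pattern: shmem_file_setup()
ran with mmap_sem already held and then asked for i_mutex via do_truncate,
while the established order takes i_mutex first and mmap_sem second.  A
user-space sketch of the two orderings (pthread mutexes standing in for the
kernel locks; all names illustrative, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t i_mutex  = PTHREAD_MUTEX_INITIALIZER; /* stand-in for inode->i_mutex */
static pthread_mutex_t mmap_sem = PTHREAD_MUTEX_INITIALIZER; /* stand-in for mm->mmap_sem   */

/* The established nesting: i_mutex outside mmap_sem. */
static void *truncate_like_path(void *unused)
{
        pthread_mutex_lock(&i_mutex);
        usleep(1000);                   /* widen the race window */
        pthread_mutex_lock(&mmap_sem);
        puts("path A: i_mutex -> mmap_sem (established order)");
        pthread_mutex_unlock(&mmap_sem);
        pthread_mutex_unlock(&i_mutex);
        return NULL;
}

/* What tiny-shmem effectively did: ask for i_mutex while mmap_sem is held. */
static void *old_shmem_file_setup_path(void *unused)
{
        pthread_mutex_lock(&mmap_sem);
        usleep(1000);
        if (pthread_mutex_trylock(&i_mutex)) {
                puts("path B: would block on i_mutex while holding mmap_sem (ABBA deadlock risk)");
        } else {
                pthread_mutex_unlock(&i_mutex);
                puts("path B: got i_mutex this time, but the ordering is still inverted");
        }
        pthread_mutex_unlock(&mmap_sem);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, truncate_like_path, NULL);
        pthread_create(&b, NULL, old_shmem_file_setup_path, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

Path B uses trylock only so that the sketch cannot actually deadlock; with
real blocking locks the two orderings can each hold one lock and wait forever
for the other, which is why the patch removes the do_truncate call instead.
Build with gcc -pthread.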

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Nick Piggin
Reported-and-tested-by: Ingo Molnar
Cc: Peter Zijlstra
Cc: Matt Mackall
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/tiny-shmem.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..d17cb6f6ab10 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,25 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	if (!dentry)
 		goto put_memory;
 
-	error = -ENOSPC;
-	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
-	if (!inode)
-		goto put_dentry;
-
-	d_instantiate(dentry, inode);
 	error = -ENFILE;
-	file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
-			&ramfs_file_operations);
+	file = get_empty_filp();
 	if (!file)
 		goto put_dentry;
 
-	inode->i_nlink = 0;	/* It is unlinked */
-
-	/* notify everyone as to the change of file size */
-	error = do_truncate(dentry, size, 0, file);
-	if (error < 0)
+	error = -ENOSPC;
+	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
 		goto close_file;
 
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;	/* It is unlinked */
+	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+			&ramfs_file_operations);
 	return file;
 
 close_file:
 	put_filp(file);
-	return ERR_PTR(error);
-
 put_dentry:
 	dput(dentry);
 put_memory:
--
cgit v1.2.2


From a10cebf56ca7e7c034d1b6646230c6553e478967 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Mon, 22 Sep 2008 13:57:52 -0700
Subject: memcg: check under limit at shrink_usage

Current memory cgroup (both in mainline and -mm) doesn't account swap
caches as memory (swap cache support is dropped temporarily now).  So
try_to_free_mem_cgroup_pages doesn't reflect the count of pages that have
been moved to swap cache.

But this makes mem_cgroup_shrink_usage fail easily if most of the pages
are anon/shmem, and then shmem_getpage returns -ENOMEM and the process
will be killed.

This patch adds res_counter_check_under_limit to avoid these cases.

BTW, even if swap cache support is enabled again, if a process is moved
to another cgroup, which has just been made, between precharge and
shrink_usage in shmem_getpage, shrink_usage may fail just because there
are no pages to reclaim.

So this change would make sense anyway.

Signed-off-by: Daisuke Nishimura
Cc: Balbir Singh
Cc: Pavel Emelyanov
Cc: KAMEZAWA Hiroyuki
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1f7a7374ba..c0500e4d3a2f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -806,6 +806,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	do {
 		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
--
cgit v1.2.2


From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Sun, 28 Sep 2008 23:09:31 +0100
Subject: mm owner: fix race between swapoff and exit

There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on.  The condition occurs when
try_to_unuse() runs in parallel with an exiting task.  A similar race
can occur with callers of get_task_mm(), such as /proc// or ptrace or
page migration.

CPU0                                    CPU1
try_to_unuse
looks at mm = task0->mm
increments mm->mm_users
                                        task 0 exits
                                        mm->owner needs to be updated, but no
                                        new owner is found (mm_users > 1, but
                                        no other task has task->mm = task0->mm)
                                        mm_update_next_owner() leaves
mmput(mm) decrements mm->mm_users
                                        task0 freed
dereferencing mm->owner fails

The fix is to notify the subsystem via mm_owner_changed callback(), if no
new owner is found, by specifying the new task as NULL.

Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(),
but must be set after that, so as not to pass NULL as old owner causing
oops.

Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but
mem_cgroup_from_task() and its callers need to take account of this
situation to avoid oops.

Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so
exec_mmap() now needs to do the same.  And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.

Reported-by: Hugh Dickins
Signed-off-by: Balbir Singh
Signed-off-by: Jiri Slaby
Signed-off-by: Daisuke Nishimura
Signed-off-by: Hugh Dickins
Cc: KAMEZAWA Hiroyuki
Cc: Paul Menage
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0500e4d3a2f..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem)) {
+			rcu_read_unlock();
+			kmem_cache_free(page_cgroup_cache, pc);
+			return 0;
+		}
 		/*
 		 * For every charge from the cgroup, increment reference count
 		 */
@@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!mem)) {
+		rcu_read_unlock();
+		return 0;
+	}
 	css_get(&mem->css);
 	rcu_read_unlock();
 
--
cgit v1.2.2


From 6c1b7f680dd4f550fa6f91f148cc6fa2c4bd0737 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Thu, 2 Oct 2008 14:50:16 -0700
Subject: memory hotplug: missing zone->lock in test_pages_isolated()

__test_page_isolated_in_pageblock() in mm/page_isolation.c has a comment
saying that the caller must hold zone->lock.  But the only caller of that
function, test_pages_isolated(), does not hold zone->lock and the lock is
also not acquired anywhere before.  This patch adds the missing zone->lock
to test_pages_isolated().

We reproducibly run into BUG_ON(!PageBuddy(page)) in
__offline_isolated_pages() during memory hotplug stress test, see trace
below.  This patch fixes that problem; it would be good if we could have
it in 2.6.27.

kernel BUG at /home/autobuild/BUILD/linux-2.6.26-20080909/mm/page_alloc.c:4561!
illegal operation: 0001 [#1] PREEMPT SMP
Modules linked in: dm_multipath sunrpc bonding qeth_l3 dm_mod qeth ccwgroup vmur
CPU: 1 Not tainted 2.6.26-29.x.20080909-s390default #1
Process memory_loop_all (pid: 10025, task: 2f444028, ksp: 2b10dd28)
Krnl PSW : 040c0000 801727ea (__offline_isolated_pages+0x18e/0x1c4)
           R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:0 PM:0
Krnl GPRS: 00000000 7e27fc00 00000000 7e27fc00 00000000 00000400 00014000 7e27fc01
           00606f00 7e27fc00 00013fe0 2b10dd28 00000005 80172662 801727b2 2b10dd28
Krnl Code: 801727de: 5810900c    l     %r1,12(%r9)
           801727e2: a7f4ffb3    brc   15,80172748
           801727e6: a7f40001    brc   15,801727e8
          >801727ea: a7f4ffbc    brc   15,80172762
           801727ee: a7f40001    brc   15,801727f0
           801727f2: a7f4ffaf    brc   15,80172750
           801727f6: 0707        bcr   0,%r7
           801727f8: 0017        unknown
Call Trace:
([<0000000000172772>] __offline_isolated_pages+0x116/0x1c4)
 [<00000000001953a2>] offline_isolated_pages_cb+0x22/0x34
 [<000000000013164c>] walk_memory_resource+0xcc/0x11c
 [<000000000019520e>] offline_pages+0x36a/0x498
 [<00000000001004d6>] remove_memory+0x36/0x44
 [<000000000028fb06>] memory_block_change_state+0x112/0x150
 [<000000000028ffb8>] store_mem_state+0x90/0xe4
 [<0000000000289c00>] sysdev_store+0x34/0x40
 [<00000000001ee048>] sysfs_write_file+0xd0/0x178
 [<000000000019b1a8>] vfs_write+0x74/0x118
 [<000000000019b9ae>] sys_write+0x46/0x7c
 [<000000000011160e>] sysc_do_restart+0x12/0x16
 [<0000000077f3e8ca>] 0x77f3e8ca

Signed-off-by: Gerald Schaefer
Acked-by: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_isolation.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c69f84fe038d..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -114,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
 
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long pfn;
+	unsigned long pfn, flags;
 	struct page *page;
+	struct zone *zone;
+	int ret;
 
 	pfn = start_pfn;
 	/*
@@ -131,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	if (pfn < end_pfn)
 		return -EBUSY;
 	/* Check all pages are free or Marked as ISOLATED */
-	if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
-		return 0;
-	return -EBUSY;
+	zone = page_zone(pfn_to_page(pfn));
+	spin_lock_irqsave(&zone->lock, flags);
+	ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret ? 0 : -EBUSY;
 }
--
cgit v1.2.2


From 4b19de6d1cb07c8bcb6778e771f9cfd5bcfdfd3e Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Thu, 2 Oct 2008 14:50:16 -0700
Subject: mm: tiny-shmem nommu fix

The previous patch db203d53d474aa068984e409d807628f5841da1b ("mm:
tiny-shmem fix lock ordering: mmap_sem vs i_mutex") to fix the lock
ordering in tiny-shmem breaks shared anonymous and IPC memory on NOMMU
architectures because it was using the expanding truncate to signal ramfs
to allocate physically contiguous RAM backing the inode (otherwise it is
unusable for "memory mapping" it to userspace).

However do_truncate is what caused the lock ordering error, due to it
taking i_mutex.  In this case, we can actually just call ramfs directly to
allocate memory for the mapping, rather than go via truncate.
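
The distinction the fix turns on, that changing a file's size is not the same
as allocating its backing store, can be seen from plain user space with
ordinary files.  In the sketch below (purely illustrative; the kernel path
itself calls ramfs_nommu_expand_for_mapping()), ftruncate() only records a
new size while posix_fallocate() actually reserves blocks:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        char path[] = "/tmp/backing-demo-XXXXXX";
        int fd = mkstemp(path);
        struct stat st;

        if (fd < 0) {
                perror("mkstemp");
                return 1;
        }
        unlink(path);                   /* like tiny-shmem's files, it is unlinked */

        /* Set the size to 1 MiB: the file is sparse, nothing is allocated yet. */
        if (ftruncate(fd, 1 << 20) != 0)
                perror("ftruncate");
        fstat(fd, &st);
        printf("after ftruncate:       size=%lld blocks=%lld\n",
               (long long)st.st_size, (long long)st.st_blocks);

        /* Explicitly allocate the backing store for the same range. */
        if (posix_fallocate(fd, 0, 1 << 20) != 0)
                perror("posix_fallocate");
        fstat(fd, &st);
        printf("after posix_fallocate: size=%lld blocks=%lld\n",
               (long long)st.st_size, (long long)st.st_blocks);

        close(fd);
        return 0;
}

On most filesystems the first fstat() reports zero blocks and the second a
full 1 MiB worth.  The NOMMU case needs the second kind of operation, and
needs the pages to be physically contiguous as well, which is what the added
ramfs_nommu_expand_for_mapping() call provides.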

Acked-by: David Howells
Acked-by: Hugh Dickins
Signed-off-by: Nick Piggin
Cc: Matt Mackall
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/tiny-shmem.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'mm')

diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d17cb6f6ab10..8d7a27a6335c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -80,6 +80,12 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	inode->i_nlink = 0;	/* It is unlinked */
 	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
 			&ramfs_file_operations);
+
+#ifndef CONFIG_MMU
+	error = ramfs_nommu_expand_for_mapping(inode, size);
+	if (error)
+		goto close_file;
+#endif
 	return file;
 
 close_file:
--
cgit v1.2.2


From 6babc32c41e3642d875372cb6afbd9ade7a9f311 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Thu, 2 Oct 2008 14:50:18 -0700
Subject: mm: handle initialising compound pages at orders greater than MAX_ORDER

When we initialise a compound page we initialise the page flags and head
page pointer for all base pages spanned by that page.  When we initialise
a gigantic page (a page of order greater than or equal to MAX_ORDER) we
have to initialise more than MAX_ORDER_NR_PAGES pages.  Currently we
assume that all elements of the mem_map in this page are contiguous in
memory.  However this is only guaranteed out to MAX_ORDER_NR_PAGES pages,
and with SPARSEMEM enabled they will not be contiguous.  This leads us to
walk off the end of the first section and scribble on everything which
follows, BAD.

When we reach a MAX_ORDER_NR_PAGES boundary we must locate the next
section of the mem_map.  As gigantic pages can only be maximally aligned
we know this will occur at an exact multiple of MAX_ORDER_NR_PAGES pages
from the start of the page.

This is a bug fix for the gigantic page support in hugetlbfs.

Credit to Mel Gorman for spotting the issue.
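
The pointer arithmetic in the fix can be modelled in user space.  In the toy
program below (all names and sizes invented for illustration) the "mem_map"
is deliberately built from separately allocated chunks, so walking p++
straight across a chunk boundary would wander into unrelated memory;
re-deriving the pointer from the pfn at every MAX_ORDER_NR_PAGES boundary,
as the patch does, keeps the walk inside valid chunks:

#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDER_NR_PAGES 8            /* pages per contiguous chunk (toy value) */
#define NR_SECTIONS        4

struct page { unsigned long first_pfn; };

/* Separately allocated chunks: deliberately NOT contiguous with each other. */
static struct page *section_mem_map[NR_SECTIONS];

static struct page *pfn_to_page(unsigned long pfn)
{
        return &section_mem_map[pfn / MAX_ORDER_NR_PAGES][pfn % MAX_ORDER_NR_PAGES];
}

/*
 * Mirrors the fixed loop: advance with p++, re-derive p at chunk boundaries.
 * head_pfn must be MAX_ORDER_NR_PAGES-aligned, as gigantic pages are.
 */
static void prep_compound(unsigned long head_pfn, unsigned long nr_pages)
{
        struct page *p = pfn_to_page(head_pfn) + 1;
        unsigned long i;

        for (i = 1; i < nr_pages; i++, p++) {
                if ((i & (MAX_ORDER_NR_PAGES - 1)) == 0)
                        p = pfn_to_page(head_pfn + i);   /* hop to the next chunk */
                p->first_pfn = head_pfn;
        }
}

int main(void)
{
        unsigned long pfn;
        int s;

        for (s = 0; s < NR_SECTIONS; s++)
                section_mem_map[s] = calloc(MAX_ORDER_NR_PAGES, sizeof(struct page));

        /* A "gigantic page" spanning all sections, starting at pfn 0. */
        prep_compound(0, NR_SECTIONS * MAX_ORDER_NR_PAGES);

        for (pfn = 1; pfn < NR_SECTIONS * MAX_ORDER_NR_PAGES; pfn++)
                if (pfn_to_page(pfn)->first_pfn != 0)
                        printf("pfn %lu not initialised!\n", pfn);
        puts("all tail pages point at the head page");
        return 0;
}

Because a gigantic page is maximally aligned, the walk crosses into a new
chunk exactly when i is a multiple of MAX_ORDER_NR_PAGES, so the single mask
test in the loop is sufficient.  Dropping that test makes the toy program
scribble past the end of the chunk it started in, the user-space shadow of
the corruption described above.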

Signed-off-by: Andy Whitcroft
Cc: Mel Gorman
Cc: Jon Tollefson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e293c58bea58..27b8681139fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
+	struct page *p = page + 1;
 
 	set_compound_page_dtor(page, free_compound_page);
 	set_compound_order(page, order);
 	__SetPageHead(page);
-	for (i = 1; i < nr_pages; i++) {
-		struct page *p = page + i;
-
+	for (i = 1; i < nr_pages; i++, p++) {
+		if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+			p = pfn_to_page(page_to_pfn(page) + i);
 		__SetPageTail(p);
 		p->first_page = page;
 	}
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
+	struct page *p = page + 1;
 
 	if (unlikely(compound_order(page) != order))
 		bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	if (unlikely(!PageHead(page)))
 		bad_page(page);
 	__ClearPageHead(page);
-	for (i = 1; i < nr_pages; i++) {
-		struct page *p = page + i;
+	for (i = 1; i < nr_pages; i++, p++) {
+		if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+			p = pfn_to_page(page_to_pfn(page) + i);
 
 		if (unlikely(!PageTail(p) |
 				(p->first_page != page)))
--
cgit v1.2.2


From 85ba94ba0592296053f7f2846812173424afe1cb Mon Sep 17 00:00:00 2001
From: Matt Mackall
Date: Tue, 7 Oct 2008 11:37:35 -0500
Subject: SLOB: fix bogus ksize calculation

SLOB's ksize calculation was braindamaged and generally harmlessly
underreported the allocation size.  But for very small buffers, it could
in fact overreport them, leading code depending on krealloc to overrun
the allocation and trample other data.

Signed-off-by: Matt Mackall
Tested-by: Peter Zijlstra
Signed-off-by: Linus Torvalds
---
 mm/slob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 4c82dd41f32e..62b679dc660f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -515,7 +515,7 @@ size_t ksize(const void *block)
 
 	sp = (struct slob_page *)virt_to_page(block);
 	if (slob_page(sp))
-		return ((slob_t *)block - 1)->units + SLOB_UNIT;
+		return (((slob_t *)block - 1)->units - 1) * SLOB_UNIT;
 	else
 		return sp->page.private;
 }
--
cgit v1.2.2


From 70096a561d1e09120bae1f293f3632cedbfd5c68 Mon Sep 17 00:00:00 2001
From: Matt Mackall
Date: Wed, 8 Oct 2008 14:51:57 -0500
Subject: SLOB: fix bogus ksize calculation fix

This fixes the previous fix, which was completely wrong on closer
inspection.  This version has been manually tested with a user-space
test harness and generates sane values.  A nearly identical patch has
been boot-tested.

The problem arose from changing how kmalloc/kfree handled alignment
padding without updating ksize to match.  This brings it in sync.
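
The header layout the final ksize() depends on is easy to model in user
space: kmalloc over-allocates by the alignment, stores the requested size in
the word just in front of the pointer it returns, and ksize() must read that
word back from the same offset and round it up to whole SLOB units.  A sketch
with invented constants (UNIT, ALIGN and the helpers below are stand-ins, not
the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define UNIT  8                                 /* stand-in for SLOB_UNIT */
#define ALIGN 16                                /* stand-in for ARCH_KMALLOC_MINALIGN */
#define UNITS(sz) (((sz) + UNIT - 1) / UNIT)    /* stand-in for SLOB_UNITS() */

static void *toy_kmalloc(size_t size)
{
        /* header (ALIGN bytes) + payload rounded up to whole units */
        unsigned int *m = malloc(ALIGN + UNITS(size) * UNIT);

        if (!m)
                return NULL;
        *m = (unsigned int)size;                /* remember the requested size */
        return (char *)m + ALIGN;               /* caller only sees the payload */
}

static size_t toy_ksize(const void *block)
{
        /* read the size back from just in front of the payload */
        const unsigned int *m = (const unsigned int *)((const char *)block - ALIGN);

        return UNITS(*m) * UNIT;                /* usable size, in whole units */
}

static void toy_kfree(void *block)
{
        free((char *)block - ALIGN);
}

int main(void)
{
        size_t sizes[] = { 1, 7, 8, 13, 100 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                void *p = toy_kmalloc(sizes[i]);

                printf("requested %3zu -> reported %3zu\n", sizes[i], toy_ksize(p));
                toy_kfree(p);
        }
        return 0;
}

In this model the reported value always covers the request and never exceeds
what was really handed out, which is the property the krealloc-based callers
mentioned in the first SLOB patch rely on.  That matches the commit's own
description of the bug: the kmalloc header layout changed, and the earlier
ksize() variants were still reading the word in front of the block the old
way.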

Signed-off-by: Matt Mackall
Signed-off-by: Linus Torvalds
---
 mm/slob.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 62b679dc660f..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -514,9 +514,11 @@ size_t ksize(const void *block)
 		return 0;
 
 	sp = (struct slob_page *)virt_to_page(block);
-	if (slob_page(sp))
-		return (((slob_t *)block - 1)->units - 1) * SLOB_UNIT;
-	else
+	if (slob_page(sp)) {
+		int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+		unsigned int *m = (unsigned int *)(block - align);
+		return SLOB_UNITS(*m) * SLOB_UNIT;
+	} else
 		return sp->page.private;
 }
--
cgit v1.2.2