From 5bead2a0680687b9576d57c177988e8aa082b922 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Sat, 13 Sep 2008 02:33:19 -0700
Subject: mm: mark the correct zone as full when scanning zonelists

The iterator for_each_zone_zonelist() uses a struct zoneref *z cursor when
scanning zonelists to keep track of where in the zonelist it is.  The
zoneref that is returned corresponds to the next zone that is to be
scanned, not the current one.  It was intended to be treated as an opaque
list.

When the page allocator is scanning a zonelist, it marks elements in the
zonelist corresponding to zones that are temporarily full.  As the
zonelist is being updated, it uses the cursor here:

        if (NUMA_BUILD)
                zlc_mark_zone_full(zonelist, z);

This is intended to prevent rescanning in the near future, but the zoneref
cursor does not correspond to the zone that has been found to be full.
This is an easy misunderstanding to make, so this patch corrects the
problem by changing the zoneref cursor to be the current zone being
scanned instead of the next one.

Signed-off-by: Mel Gorman
Cc: Andy Whitcroft
Cc: KAMEZAWA Hiroyuki
Cc: [2.6.26.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmzone.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
                 (z->zone && !zref_in_nodemask(z, nodes)))
                 z++;
 
-        *zone = zonelist_zone(z++);
+        *zone = zonelist_zone(z);
         return z;
 }
--
cgit v1.2.2
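
[Editorial sketch, not part of the series: a toy userspace model of the
cursor pitfall described in the patch above.  Everything here (the slots
array, scan_old(), scan_new(), mark_full()) is invented for illustration;
it only mimics how a cursor that already points past the element just
scanned makes "mark the slot at the cursor" mark the wrong one.]

#include <stdio.h>

#define NSLOTS 3

/* Invented stand-in for a zonelist: named slots plus a "temporarily
 * full" flag per slot. */
static const char *slots[NSLOTS] = { "DMA", "Normal", "HighMem" };
static int full[NSLOTS];

/* Old behaviour: return a cursor already advanced past the slot that
 * was just selected (i.e. the index of the *next* slot to scan). */
static int scan_old(int z, const char **out)
{
        *out = slots[z];
        return z + 1;
}

/* Fixed behaviour: the returned cursor still refers to the slot that
 * was selected; the caller advances it before the next iteration. */
static int scan_new(int z, const char **out)
{
        *out = slots[z];
        return z;
}

/* Plays the role of zlc_mark_zone_full(): mark the slot at the cursor. */
static void mark_full(int z)
{
        if (z < NSLOTS)
                full[z] = 1;
}

int main(void)
{
        const char *zone;
        int z;

        z = scan_old(0, &zone);
        mark_full(z);                   /* marks "Normal", but "DMA" was scanned */
        printf("old: scanned %-7s -> marked %s\n", zone, slots[z]);
        full[z] = 0;

        z = scan_new(0, &zone);
        mark_full(z);                   /* marks "DMA", the slot actually scanned */
        printf("new: scanned %-7s -> marked %s\n", zone, slots[z]);
        return 0;
}

The real fix is the one-line change above; the model only shows why
marking the slot at an already-advanced cursor marks the wrong slot.
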
From 02b71b70129aaaa38f280af2aa5a767a4dec9107 Mon Sep 17 00:00:00 2001
From: Salman Qazi
Date: Thu, 11 Sep 2008 12:25:41 -0700
Subject: slub: fixed uninitialized counter in struct kmem_cache_node

Initialized the total_objects atomic for the node in init_kmem_cache_node.
The uninitialized value was ruining the stats in /proc/slabinfo.

Acked-by: Christoph Lameter
Signed-off-by: Salman Qazi
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index fb486d5540f8..0c83e6afe7b2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1932,6 +1932,7 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
         INIT_LIST_HEAD(&n->partial);
 #ifdef CONFIG_SLUB_DEBUG
         atomic_long_set(&n->nr_slabs, 0);
+        atomic_long_set(&n->total_objects, 0);
         INIT_LIST_HEAD(&n->full);
 #endif
 }
--
cgit v1.2.2

From db203d53d474aa068984e409d807628f5841da1b Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Mon, 22 Sep 2008 13:57:50 -0700
Subject: mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex

tiny-shmem calls do_truncate in shmem_file_setup.  do_truncate takes
i_mutex, and shmem_file_setup is called with mmap_sem held.  However,
i_mutex nests outside mmap_sem.

Copy the code in shmem.c to avoid this problem.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Nick Piggin
Reported-and-tested-by: Ingo Molnar
Cc: Peter Zijlstra
Cc: Matt Mackall
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/tiny-shmem.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..d17cb6f6ab10 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,25 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
         if (!dentry)
                 goto put_memory;
 
-        error = -ENOSPC;
-        inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
-        if (!inode)
-                goto put_dentry;
-
-        d_instantiate(dentry, inode);
         error = -ENFILE;
-        file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
-                        &ramfs_file_operations);
+        file = get_empty_filp();
         if (!file)
                 goto put_dentry;
 
-        inode->i_nlink = 0;     /* It is unlinked */
-
-        /* notify everyone as to the change of file size */
-        error = do_truncate(dentry, size, 0, file);
-        if (error < 0)
+        error = -ENOSPC;
+        inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+        if (!inode)
                 goto close_file;
 
+        d_instantiate(dentry, inode);
+        inode->i_size = size;
+        inode->i_nlink = 0;     /* It is unlinked */
+        init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+                        &ramfs_file_operations);
         return file;
 
 close_file:
         put_filp(file);
-        return ERR_PTR(error);
-
 put_dentry:
         dput(dentry);
 put_memory:
--
cgit v1.2.2

From a10cebf56ca7e7c034d1b6646230c6553e478967 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Mon, 22 Sep 2008 13:57:52 -0700
Subject: memcg: check under limit at shrink_usage

The current memory cgroup (both in mainline and -mm) doesn't account swap
caches as memory (swap cache support is dropped temporarily now), so
try_to_free_mem_cgroup_pages doesn't reflect the count of pages that have
been moved to swap cache.

But this makes mem_cgroup_shrink_usage fail easily if most of the pages
are anon/shmem, and then shmem_getpage returns -ENOMEM and the process
will be killed.

This patch adds a res_counter_check_under_limit check to avoid these cases.

BTW, even if swap cache support is enabled again, if a process is moved to
another cgroup that has just been created, between precharge and
shrink_usage in shmem_getpage, shrink_usage may fail just because there
are no pages to reclaim.  So this change would make sense anyway.

Signed-off-by: Daisuke Nishimura
Cc: Balbir Singh
Cc: Pavel Emelyanov
Cc: KAMEZAWA Hiroyuki
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1f7a7374ba..c0500e4d3a2f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -806,6 +806,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
         do {
                 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+                progress += res_counter_check_under_limit(&mem->res);
         } while (!progress && --retry);
 
         css_put(&mem->css);
--
cgit v1.2.2
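
[Editorial sketch, not part of the patch above: a minimal userspace model
of why "already under the limit" has to count as progress when reclaimed
pages are not accounted.  The numbers and the helpers usage, limit,
try_reclaim() and under_limit() are all invented for the illustration.]

#include <stdio.h>

/* Invented numbers: the group starts over its limit. */
static long usage = 120, limit = 100;

/* Model of reclaim when swap caches are not accounted: pages leave the
 * accounted usage, but none of them are reported as "reclaimed". */
static int try_reclaim(void)
{
        usage -= 30;
        return 0;
}

static int under_limit(void)
{
        return usage <= limit;
}

int main(void)
{
        int progress, retry = 5;

        do {
                progress  = try_reclaim();
                progress += under_limit();      /* the check the patch adds */
        } while (!progress && --retry);

        /* Without the under_limit() term, progress would stay 0, the loop
         * would burn all retries, and the caller would see a failure even
         * though usage is now below the limit. */
        printf("progress=%d usage=%ld limit=%ld\n", progress, usage, limit);
        return 0;
}
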
From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Sun, 28 Sep 2008 23:09:31 +0100
Subject: mm owner: fix race between swapoff and exit

There's a race between mm->owner assignment and swapoff, more easily seen
when task slab poisoning is turned on.  The condition occurs when
try_to_unuse() runs in parallel with an exiting task.  A similar race can
occur with callers of get_task_mm(), such as /proc// or ptrace or page
migration.

CPU0                                    CPU1
                                        try_to_unuse
                                        looks at mm = task0->mm
                                        increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
                                        mmput(mm)
                                        decrements mm->mm_users
task0 freed
dereferencing mm->owner fails

The fix is to notify the subsystem via the mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.

Jiri Slaby: mm->owner was set to NULL prior to calling
cgroup_mm_owner_callbacks(), but must be set after that, so as not to
pass NULL as the old owner and cause an oops.

Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but
mem_cgroup_from_task() and its callers need to take account of this
situation to avoid an oops.

Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing
these patches.  exit_mm() up_reads mmap_sem before calling
mm_update_next_owner(), so exec_mmap() now needs to do the same.  And with
that repositioning, there's now no point in mm_need_new_owner() allowing
for NULL mm.

Reported-by: Hugh Dickins
Signed-off-by: Balbir Singh
Signed-off-by: Jiri Slaby
Signed-off-by: Daisuke Nishimura
Signed-off-by: Hugh Dickins
Cc: KAMEZAWA Hiroyuki
Cc: Paul Menage
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0500e4d3a2f..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+        /*
+         * mm_update_next_owner() may clear mm->owner to NULL
+         * if it races with swapoff, page migration, etc.
+         * So this can be called with p == NULL.
+         */
+        if (unlikely(!p))
+                return NULL;
+
         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                 struct mem_cgroup, css);
 }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
         if (likely(!memcg)) {
                 rcu_read_lock();
                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+                if (unlikely(!mem)) {
+                        rcu_read_unlock();
+                        kmem_cache_free(page_cgroup_cache, pc);
+                        return 0;
+                }
                 /*
                  * For every charge from the cgroup, increment reference count
                  */
@@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
         rcu_read_lock();
         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+        if (unlikely(!mem)) {
+                rcu_read_unlock();
+                return 0;
+        }
         css_get(&mem->css);
         rcu_read_unlock();
 
--
cgit v1.2.2
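
[Editorial sketch, not part of the patch above: a minimal userspace model
of the defensive pattern the memcontrol.c hunks adopt -- the lookup may
now legitimately return NULL because a racing exit cleared the owner
pointer, and every caller bails out instead of dereferencing it.  struct
task, struct group and the helpers are invented for the illustration.]

#include <stdio.h>

/* Invented stand-ins for task_struct and mem_cgroup. */
struct task  { const char *comm; };
struct group { const char *name; };

static struct group root_group = { "root" };

/* Like mem_cgroup_from_task() after the patch: a racing exit may already
 * have cleared the owner pointer, so p can legitimately be NULL here. */
static struct group *group_from_task(struct task *p)
{
        if (p == NULL)
                return NULL;
        return &root_group;     /* the real code derives the group from p */
}

/* Caller pattern from the charge/shrink paths: check for NULL and give
 * up cleanly rather than crash on a vanished owner. */
static int charge(struct task *owner)
{
        struct group *g = group_from_task(owner);

        if (g == NULL)
                return 0;
        printf("charging group %s\n", g->name);
        return 1;
}

int main(void)
{
        struct task t = { "task0" };

        charge(&t);     /* normal case */
        charge(NULL);   /* owner already cleared by the racing exit */
        return 0;
}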