From 69cf0fac6052c5bd3fb3469a41d4216e926028f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 17 Apr 2006 22:46:32 +0100 Subject: [PATCH] Fix MADV_REMOVE protection checking madvise_remove needs to respect file and mmap protections. Signed-off-by: Hugh Dickins [ Will the real CVE-2006-1524 stand up, please.. ] Signed-off-by: Linus Torvalds --- mm/madvise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index af3d573b0141..4e196155a0c3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -168,6 +168,9 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + mapping = vma->vm_file->f_mapping; offset = (loff_t)(start - vma->vm_start) -- cgit v1.2.2 From 75129e297e861e6c61038aa4cdbf604b022de4ff Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Tue, 18 Apr 2006 22:20:33 -0700 Subject: [PATCH] mm/slob.c: for_each_possible_cpu(), not NR_CPUS Convert for-loops that explicitly reference "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 9bcc7e2cabfd..a68255ba4553 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -354,9 +354,7 @@ void *__alloc_percpu(size_t size) if (!pdata) return NULL; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) { pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); if (!pdata->ptrs[i]) goto unwind_oom; @@ -383,11 +381,9 @@ free_percpu(const void *objp) int i; struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) kfree(p->ptrs[i]); - } + kfree(p); } EXPORT_SYMBOL(free_percpu); -- cgit v1.2.2 From 97c2c9b84d0c1edf4926b13661d5af3f0edccbce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:38 -0700 Subject: [PATCH] oom-kill: mm locking fix Dave Peterson points out that badness() is playing with mm_structs without taking a reference on them. mmput() can sleep, so taking a reference here (inside tasklist_lock) is hard. Fix it up via task_lock() instead. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 78747afad6b0..9a643c4bf77c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -46,15 +46,25 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; - struct list_head *tsk; + struct mm_struct *mm; + struct task_struct *child; - if (!p->mm) + task_lock(p); + mm = p->mm; + if (!mm) { + task_unlock(p); return 0; + } /* * The memory size of the process is the basis for the badness. */ - points = p->mm->total_vm; + points = mm->total_vm; + + /* + * After this unlock we can no longer dereference local variable `mm' + */ + task_unlock(p); /* * Processes which fork a lot of child processes are likely @@ -64,11 +74,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * child is eating the vast majority of memory, adding only half * to the parents will make the child our kill candidate of choice. 
*/ - list_for_each(tsk, &p->children) { - struct task_struct *chld; - chld = list_entry(tsk, struct task_struct, sibling); - if (chld->mm != p->mm && chld->mm) - points += chld->mm->total_vm/2 + 1; + list_for_each_entry(child, &p->children, sibling) { + task_lock(child); + if (child->mm != mm && child->mm) + points += child->mm->total_vm/2 + 1; + task_unlock(child); } /* -- cgit v1.2.2 From 013159227b840dfd441bd2e4c8b4d77ffb3cc42e Mon Sep 17 00:00:00 2001 From: Dave Peterson Date: Tue, 18 Apr 2006 22:20:44 -0700 Subject: [PATCH] mm: fix mm_struct reference counting bugs in mm/oom_kill.c Fix oom_kill_task() so it doesn't call mmput() (which may sleep) while holding tasklist_lock. Signed-off-by: David S. Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9a643c4bf77c..042e6436c3ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,17 +254,24 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(task_t *p, const char *message) { - struct mm_struct *mm = get_task_mm(p); + struct mm_struct *mm; task_t * g, * q; - if (!mm) - return NULL; - if (mm == &init_mm) { - mmput(mm); - return NULL; - } + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. 
+ */ + + if (mm == NULL || mm == &init_mm) + return 1; __oom_kill_task(p, message); /* @@ -276,13 +283,12 @@ static struct mm_struct *oom_kill_task(task_t *p, const char *message) __oom_kill_task(q, message); while_each_thread(g, q); - return mm; + return 0; } -static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points, const char *message) +static int oom_kill_process(struct task_struct *p, unsigned long points, + const char *message) { - struct mm_struct *mm; struct task_struct *c; struct list_head *tsk; @@ -293,9 +299,8 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c, message); - if (mm) - return mm; + if (!oom_kill_task(c, message)) + return 0; } return oom_kill_task(p, message); } @@ -310,7 +315,6 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct mm_struct *mm = NULL; task_t *p; unsigned long points = 0; @@ -330,12 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) */ switch (constrained_alloc(zonelist, gfp_mask)) { case CONSTRAINT_MEMORY_POLICY: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory (MPOL_BIND)"); break; case CONSTRAINT_CPUSET: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory in cpuset"); break; @@ -357,8 +361,7 @@ retry: panic("Out of memory and no killable processes...\n"); } - mm = oom_kill_process(p, points, "Out of memory"); - if (!mm) + if (oom_kill_process(p, points, "Out of memory")) goto retry; break; @@ -367,8 +370,6 @@ retry: out: read_unlock(&tasklist_lock); cpuset_unlock(); - if (mm) - mmput(mm); /* * Give "p" a good chance of killing itself before we -- cgit v1.2.2 From 6aa3001b239b387d98a7f945e4a51edeb59e4f2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:52 -0700 Subject: [PATCH] page_alloc.c: buddy handling cleanup Fix up some whitespace damage. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827c7d66..123c60586740 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,11 +232,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -299,9 +301,9 @@ static inline int page_is_buddy(struct page *page, int order) if (PageBuddy(page) && page_order(page) == order) { BUG_ON(page_count(page) != 0); - return 1; + return 1; } - return 0; + return 0; } /* -- cgit v1.2.2 From 6d472be37896b1c41b50f3da124f8b7718ba7797 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Apr 2006 02:43:12 -0700 Subject: [PATCH] Remove cond_resched in gather_stats() gather_stats() is called with a spinlock held from check_pte_range. We cannot reschedule with a lock held. 
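For illustration only (not part of the patch; names and locking details are schematic), the bug class being removed looks roughly like this:

	spin_lock(ptl);				/* page-table lock held: may not sleep */
	for (; addr != end; pte++, addr += PAGE_SIZE)
		gather_stats(page, md, pte_dirty(*pte));
	spin_unlock(ptl);

	/* A cond_resched() inside gather_stats() could schedule away while
	 * ptl is held ("scheduling while atomic").  Removing it is the fix;
	 * any rescheduling must happen in the caller, outside the lock. */
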
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dec8249e972d..8778f58880c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1761,7 +1761,6 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) md->mapcount_max = count; md->node[page_to_nid(page)]++; - cond_resched(); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.2 From 304dbdb7a4fbb7f40a6ad5c5836fdd456c233c63 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 22 Apr 2006 02:35:48 -0700 Subject: [PATCH] add migratepage address space op to shmem Basic problem: pages of a shared memory segment can only be migrated once. In 2.6.16 through 2.6.17-rc1, shared memory mappings do not have a migratepage address space op. Therefore, migrate_pages() falls back to default processing. In this path, it will try to pageout() dirty pages. Once a shared memory page has been migrated it becomes dirty, so migrate_pages() will try to page it out. However, because the page count is 3 [cache + current + pte], pageout() will return PAGE_KEEP because is_page_cache_freeable() returns false. This will abort all subsequent migrations. This patch adds a migratepage address space op to shared memory segments to avoid taking the default path. We use the "migrate_page()" function because it knows how to migrate dirty pages. This allows shared memory segment pages to migrate, subject to other conditions such as # pte's referencing the page [page_mapcount(page)], when requested. I think this is safe. If we're migrating a shared memory page, then we found the page via a page table, so it must be in memory. Can be verified with memtoy and the shmem-mbind-test script, both available at: http://free.linux.hp.com/~lts/Tools/ Signed-off-by: Lee Schermerhorn Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 37eaf42ed2c6..4c5e68e4e9ae 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -46,6 +46,8 @@ #include #include #include +#include + #include #include #include @@ -2173,6 +2175,7 @@ static struct address_space_operations shmem_aops = { .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, #endif + .migratepage = migrate_page, }; static struct file_operations shmem_file_operations = { -- cgit v1.2.2 From 83d722f7e198b034699b1500d98729beff930efd Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Mon, 24 Apr 2006 19:35:21 -0700 Subject: [PATCH] Remove __devinit and __cpuinit from notifier_call definitions Few of the notifier_chain_register() callers use __init in the definition of notifier_call. It is incorrect as the function definition should be available after the initializations (they do not unregister them during initializations). This patch fixes all such usages to _not_ have the notifier_call __init section. 
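For illustration (hypothetical names, not from the patch), the failure mode is a registered notifier whose callback body lives in memory that is discarded once boot completes; plain __init is used here for clarity:

	/* BROKEN: the callback text is freed after boot, but the notifier
	 * block registered below keeps pointing into it. */
	static int __init my_cpuup_callback(struct notifier_block *nfb,
					    unsigned long action, void *hcpu)
	{
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpuup_nb = {
		.notifier_call = my_cpuup_callback,
	};

	/* register_cpu_notifier(&my_cpuup_nb) is never undone, so an event
	 * delivered after init jumps into freed memory.  Dropping the
	 * __init-family annotation keeps the callback text resident. */
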
Signed-off-by: Chandra Seetharaman Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 123c60586740..ea77c999047e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1962,7 +1962,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, +static int pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/mm/slab.c b/mm/slab.c index e6ef9bd52335..af5c5237e11a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1036,7 +1036,7 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/mm/vmscan.c b/mm/vmscan.c index acdf001d6941..4649a63a8cb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1328,7 +1328,7 @@ repeat: not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { pg_data_t *pgdat; -- cgit v1.2.2 From ebf43500ef148a380bd132743c3fc530111ac620 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 08:46:01 +0200 Subject: [PATCH] Add find_get_pages_contig(): contiguous variant of find_get_pages() find_get_pages_contig() will break out if we hit a hole in the page cache. From Andrew Morton, small modifications and documentation by me. Signed-off-by: Jens Axboe --- mm/filemap.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3ef20739e725..fd57442186cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -697,6 +697,38 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return ret; } +/** + * find_get_pages_contig - gang contiguous pagecache lookup + * @mapping: The address_space to search + * @index: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages_contig() works exactly like find_get_pages(), except + * that the returned number of pages are guaranteed to be contiguous. + * + * find_get_pages_contig() returns the number of pages which were found. + */ +unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, index, nr_pages); + for (i = 0; i < ret; i++) { + if (pages[i]->mapping == NULL || pages[i]->index != index) + break; + + page_cache_get(pages[i]); + index++; + } + read_unlock_irq(&mapping->tree_lock); + return i; +} + /* * Like find_get_pages, except we only return pages which are tagged with * `tag'. We update *index to index the next page for the traversal. 
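A brief usage sketch of the new helper (illustrative caller, not from the patch): the return value is the number of contiguous pages actually found, each handed back with an extra reference the caller must drop:

	struct page *pages[16];
	unsigned int i, nr;

	nr = find_get_pages_contig(mapping, index, 16, pages);
	/* pages[0..nr-1] cover index, index+1, ... with no holes */
	for (i = 0; i < nr; i++) {
		/* ... operate on pages[i] ... */
		page_cache_release(pages[i]);	/* drop the lookup's reference */
	}
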
-- cgit v1.2.2 From 693f7d362055261882659475d2ef022e32edbff1 Mon Sep 17 00:00:00 2001 From: "shin, jacob" Date: Fri, 28 Apr 2006 10:54:37 -0500 Subject: [PATCH] slab: fix crash on __drain_alien_cahce() during CPU Hotplug transfer_objects should only be called when all of the cpus in the node are online. CPU_DEAD notifier callback marks l3->shared to NULL. Signed-off-by: Jacob Shin Signed-off-by: Linus Torvalds --- mm/slab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index af5c5237e11a..c32af7e7581e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -979,7 +979,8 @@ static void __drain_alien_cache(struct kmem_cache *cachep, * That way we could avoid the overhead of putting the objects * into the free lists and getting them back later. */ - transfer_objects(rl3->shared, ac, ac->limit); + if (rl3->shared) + transfer_objects(rl3->shared, ac, ac->limit); free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; -- cgit v1.2.2 From 4c28f81193b6778f7b49090930d88e6d12bcb928 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 1 May 2006 12:16:08 -0700 Subject: [PATCH] page migration: Fix fallback behavior for dirty pages Currently we check PageDirty() in order to make the decision to swap out the page. However, the dirty information may be only be contained in the ptes pointing to the page. We need to first unmap the ptes before checking for PageDirty(). If unmap is successful then the page count of the page will also be decreased so that pageout() works properly. This is a fix necessary for 2.6.17. Without this fix we may migrate dirty pages for filesystems without migration functions. Filesystems may keep pointers to dirty pages. Migration of dirty pages can result in the filesystem keeping pointers to freed pages. Unmapping is currently not be separated out from removing all the references to a page and moving the mapping. Therefore try_to_unmap will be called again in migrate_page() if the writeout is successful. However, it wont do anything since the ptes are already removed. The coming updates to the page migration code will restructure the code so that this is no longer necessary. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d444229f2599..1c25040693d2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -439,6 +439,17 @@ redo: goto unlock_both; } + /* Make sure the dirty bit is up to date */ + if (try_to_unmap(page, 1) == SWAP_FAIL) { + rc = -EPERM; + goto unlock_both; + } + + if (page_mapcount(page)) { + rc = -EAGAIN; + goto unlock_both; + } + /* * Default handling if a filesystem does not provide * a migration function. We can only migrate clean -- cgit v1.2.2 From 46a66eecdf7bc12562ecb492297447ed0e1ecf59 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 1 May 2006 12:16:09 -0700 Subject: [PATCH] sparsemem interaction with memory add bug fixes This patch fixes two bugs with the way sparsemem interacts with memory add. They are: - memory leak if memmap for section already exists - calling alloc_bootmem_node() after boot These bugs were discovered and a first cut at the fixes were provided by Arnd Bergmann and Joel Schopp . 
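For illustration (summarizing the hunk below rather than adding to it), the rule the fix applies is that bootmem is only usable while the system is still booting, so the allocator has to be chosen at run time:

	if (system_state == SYSTEM_RUNNING)	/* memory hot-add, after boot */
		section = kmalloc_node(array_size, GFP_KERNEL, nid);
	else					/* early boot */
		section = alloc_bootmem_node(NODE_DATA(nid), array_size);

A later patch in this series replaces the system_state test with the more precise slab_is_available().
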
Signed-off-by: Mike Kravetz Signed-off-by: Joel Schopp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 0a51f36ba3a1..d7c32de99ee8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -32,7 +32,10 @@ static struct mem_section *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + if (system_state == SYSTEM_RUNNING) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) memset(section, 0, array_size); @@ -281,9 +284,9 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_init_one_section(ms, section_nr, memmap); - if (ret <= 0) - __kfree_section_memmap(memmap, nr_pages); out: pgdat_resize_unlock(pgdat, &flags); + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); return ret; } -- cgit v1.2.2 From bed120c64eb07b6838bb758109811484af8cebba Mon Sep 17 00:00:00 2001 From: Joel H Schopp Date: Mon, 1 May 2006 12:16:11 -0700 Subject: [PATCH] spufs: fix for CONFIG_NUMA Based on an older patch from Mike Kravetz We need to have a mem_map for high addresses in order to make fops->no_page work on spufs mem and register files. So far, we have used the memory_present() function during early bootup, but that did not work when CONFIG_NUMA was enabled. We now use the __add_pages() function to add the mem_map when loading the spufs module, which is a lot nicer. Signed-off-by: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1fe76d963ac2..1ae2b2cc3a54 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -69,12 +69,16 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { err = __add_section(zone, phys_start_pfn + i); - if (err) + /* We want to keep adding the rest of the + * sections if the first ones already exist + */ + if (err && (err != -EEXIST)) break; } return err; } +EXPORT_SYMBOL_GPL(__add_pages); static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) -- cgit v1.2.2 From ac924c6034d9095f95ee889f7e31bbb9145da0c2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 15 May 2006 09:43:59 -0700 Subject: [PATCH] setup_per_zone_pages_min() overflow fix As pointed out in http://bugzilla.kernel.org/show_bug.cgi?id=6490, this function can experience overflows on 32-bit machines, causing our response to changed values of min_free_kbytes to go whacky. Fixing it efficiently is all too hard, so fix it with 64-bit math instead. 
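For a concrete sense of the overflow (numbers purely illustrative): with 4 KB pages, pages_min = 16384 (a 64 MB min_free_kbytes) and zone->present_pages = 262144 (a 1 GB zone) give

	pages_min * zone->present_pages = 16384 * 262144 = 4294967296

which already exceeds the 32-bit unsigned long maximum of 4294967295. Promoting one operand to u64 and dividing with do_div(), as in the hunk below, keeps the full intermediate product.
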
Cc: Ake Sandgren Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea77c999047e..813b4ec1298a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -39,6 +39,7 @@ #include #include +#include #include "internal.h" /* @@ -2566,9 +2567,11 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { - unsigned long tmp; + u64 tmp; + spin_lock_irqsave(&zone->lru_lock, flags); - tmp = (pages_min * zone->present_pages) / lowmem_pages; + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -2595,8 +2598,8 @@ void setup_per_zone_pages_min(void) zone->pages_min = tmp; } - zone->pages_low = zone->pages_min + tmp / 4; - zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); spin_unlock_irqrestore(&zone->lru_lock, flags); } -- cgit v1.2.2 From 39d24e64263cd3211705d3b61ea4171c65030921 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 15 May 2006 09:44:13 -0700 Subject: [PATCH] add slab_is_available() routine for boot code slab_is_available() indicates slab based allocators are available for use. SPARSEMEM code needs to know this as it can be called at various times during the boot process. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++++++ mm/sparse.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c32af7e7581e..b1d643b5238d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -700,6 +700,14 @@ static enum { FULL } g_cpucache_up; +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up == FULL; +} + static DEFINE_PER_CPU(struct work_struct, reap_work); static void free_block(struct kmem_cache *cachep, void **objpp, int len, diff --git a/mm/sparse.c b/mm/sparse.c index d7c32de99ee8..c5e89eb9ac8f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -32,7 +32,7 @@ static struct mem_section *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (system_state == SYSTEM_RUNNING) + if (slab_is_available()) section = kmalloc_node(array_size, GFP_KERNEL, nid); else section = alloc_bootmem_node(NODE_DATA(nid), array_size); -- cgit v1.2.2 From a4523a8b38089478f93bc053c31f678c63f5ee1b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 May 2006 11:41:00 -0700 Subject: [PATCH] slab: Fix kmem_cache_destroy() on NUMA With CONFIG_NUMA set, kmem_cache_destroy() may fail and say "Can't free all objects." The problem is caused by sequences such as the following (suppose we are on a NUMA machine with two nodes, 0 and 1): * Allocate an object from cache on node 0. * Free the object on node 1. The object is put into node 1's alien array_cache for node 0. * Call kmem_cache_destroy(), which ultimately ends up in __cache_shrink(). * __cache_shrink() does drain_cpu_caches(), which loops through all nodes. For each node it drains the shared array_cache and then handles the alien array_cache for the other node. However this means that node 0's shared array_cache will be drained, and then node 1 will move the contents of its alien[0] array_cache into that same shared array_cache. 
node 0's shared array_cache is never looked at again, so the objects left there will appear to be in use when __cache_shrink() calls __node_shrink() for node 0. So __node_shrink() will return 1 and kmem_cache_destroy() will fail. This patch fixes this by having drain_cpu_caches() do drain_alien_cache() on every node before it does drain_array() on the nodes' shared array_caches. The problem was originally reported by Or Gerlitz . Signed-off-by: Roland Dreier Acked-by: Christoph Lameter Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b1d643b5238d..d31a06bfbea5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2200,11 +2200,14 @@ static void drain_cpu_caches(struct kmem_cache *cachep) check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; - if (l3) { + if (l3 && l3->alien) + drain_alien_cache(cachep, l3->alien); + } + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (l3) drain_array(cachep, l3, l3->shared, 1, node); - if (l3->alien) - drain_alien_cache(cachep, l3->alien); - } } } -- cgit v1.2.2 From 12783b002db1f02c29353c8f698a85514420b9f4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 20 May 2006 15:00:05 -0700 Subject: [PATCH] SPARSEMEM incorrectly calculates section number A bad calculation/loop in __section_nr() could result in incorrect section information being put into sysfs memory entries. This primarily impacts memory add operations as the sysfs information is used while onlining new memory. Fix suggested by Dave Hansen. Note that the bug may not be obvious from the patch. It actually occurs in the function's return statement: return (root_nr * SECTIONS_PER_ROOT) + (ms - root); In the existing code, root_nr has already been multiplied by SECTIONS_PER_ROOT. Signed-off-by: Mike Kravetz Cc: Dave Hansen Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index c5e89eb9ac8f..100040c0dfb6 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -87,11 +87,8 @@ int __section_nr(struct mem_section* ms) unsigned long root_nr; struct mem_section* root; - for (root_nr = 0; - root_nr < NR_MEM_SECTIONS; - root_nr += SECTIONS_PER_ROOT) { - root = __nr_to_section(root_nr); - + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); if (!root) continue; -- cgit v1.2.2 From bdd804f478a0cc74bf7db8e9f9d5fd379d1b31ca Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sat, 20 May 2006 15:00:09 -0700 Subject: [PATCH] Cpuset: might sleep checking zones allowed fix Fix a couple of infrequently encountered 'sleeping function called from invalid context' in the cpuset hooks in __alloc_pages. Could sleep while interrupts disabled. The routine cpuset_zone_allowed() is called by code in mm/page_alloc.c __alloc_pages() to determine if a zone is allowed in the current tasks cpuset. This routine can sleep, for certain GFP_KERNEL allocations, if the zone is on a memory node not allowed in the current cpuset, but might be allowed in a parent cpuset. But we can't sleep in __alloc_pages() if in interrupt, nor if called for a GFP_ATOMIC request (__GFP_WAIT not set in gfp_flags). 
The rule was intended to be: Don't call cpuset_zone_allowed() if you can't sleep, unless you pass in the __GFP_HARDWALL flag set in gfp_flag, which disables the code that might scan up ancestor cpusets and sleep. This rule was being violated in a couple of places, due to a bogus change made (by myself, pj) to __alloc_pages() as part of the November 2005 effort to cleanup its logic, and also due to a later fix to constrain which swap daemons were awoken. The bogus change can be seen at: http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-11/4691.html [PATCH 01/05] mm fix __alloc_pages cpuset ALLOC_* flags This was first noticed on a tight memory system, in code that was disabling interrupts and doing allocation requests with __GFP_WAIT not set, which resulted in __might_sleep() writing complaints to the log "Debug: sleeping function called ...", when the code in cpuset_zone_allowed() tried to take the callback_sem cpuset semaphore. We haven't seen a system hang on this 'might_sleep' yet, but we are at decent risk of seeing it fairly soon, especially since the additional cpuset_zone_allowed() check was added, conditioning wakeup_kswapd(), in March 2006. Special thanks to Dave Chinner, for figuring this out, and a tip of the hat to Nick Piggin who warned me of this back in Nov 2005, before I was ready to listen. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 813b4ec1298a..bb3416932ab0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -951,7 +951,7 @@ restart: goto got_pg; do { - if (cpuset_zone_allowed(*z, gfp_mask)) + if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) wakeup_kswapd(*z, order); } while (*(++z)); @@ -970,7 +970,8 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations -- cgit v1.2.2 From e984bb43f7450312ba66fe0e67a99efa6be3b246 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 20 May 2006 15:00:31 -0700 Subject: [PATCH] Align the node_mem_map endpoints to a MAX_ORDER boundary Andy added code to buddy allocator which does not require the zone's endpoints to be aligned to MAX_ORDER. An issue is that the buddy allocator requires the node_mem_map's endpoints to be MAX_ORDER aligned. Otherwise __page_find_buddy could compute a buddy not in node_mem_map for partial MAX_ORDER regions at zone's endpoints. page_is_buddy will detect that these pages at endpoints are not PG_buddy (they were zeroed out by bootmem allocator and not part of zone). Of course the negative here is we could waste a little memory but the positive is eliminating all the old checks for zone boundary conditions. SPARSEMEM won't encounter this issue because of MAX_ORDER size constraint when SPARSEMEM is configured. ia64 VIRTUAL_MEM_MAP doesn't need the logic either because the holes and endpoints are handled differently. This leaves checking alloc_remap and other arches which privately allocate for node_mem_map. 
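To see why the map endpoints matter (illustrative sketch, not from the patch): the buddy of a page is located by flipping one bit of its index, with no bounds check against the zone, along the lines of __page_find_buddy():

	buddy_idx = page_idx ^ (1 << order);
	buddy = page + (buddy_idx - page_idx);

If node_mem_map began or ended at a pfn that is not MAX_ORDER aligned, the computed buddy could point at struct pages that were never allocated. Rounding the map's start down and its end up to MAX_ORDER_NR_PAGES, as the hunk below does, guarantees the buddy always lands on a real (possibly zeroed, non-PG_buddy) struct page, which page_is_buddy() then safely rejects.
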
Signed-off-by: Bob Picco Acked-by: Mel Gorman Cc: Dave Hansen Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb3416932ab0..253a450c400d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2125,14 +2125,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size; + unsigned long size, start, end; struct page *map; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); - pgdat->node_mem_map = map; + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifdef CONFIG_FLATMEM /* -- cgit v1.2.2 From 25a6df952542ad9f284421b6ffe28f3eb3df1305 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 30 May 2006 21:25:42 -0700 Subject: [PATCH] spanned_pages is not updated at a case of memory hot-add From: Yasunori Goto If hot-added memory's address is smaller than old area, spanned_pages will not be updated. It must be fixed. example) Old zone_start_pfn = 0x60000, and spanned_pages = 0x10000 Added new memory's start_pfn = 0x50000, and end_pfn = 0x60000 new spanned_pages will be still 0x10000 by old code. (It should be updated to 0x20000.) Because old_zone_end_pfn will be 0x70000, and end_pfn smaller than it. So, spanned_pages will not be updated. In current code, spanned_pages is updated only when end_pfn is updated. But, it should be updated by subtraction between bigger end_pfn and new zone_start_pfn. Signed-off-by: Yasunori Goto Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ae2b2cc3a54..70df5c0d957e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -91,8 +91,8 @@ static void grow_zone_span(struct zone *zone, if (start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; - if (end_pfn > old_zone_end_pfn) - zone->spanned_pages = end_pfn - zone->zone_start_pfn; + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - + zone->zone_start_pfn; zone_span_writeunlock(zone); } @@ -106,8 +106,8 @@ static void grow_pgdat_span(struct pglist_data *pgdat, if (start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; - if (end_pfn > old_pgdat_end_pfn) - pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - + pgdat->node_start_pfn; } int online_pages(unsigned long pfn, unsigned long nr_pages) -- cgit v1.2.2 From b1ab41c4943008375c149a63602d7407f61de5b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 2 Jun 2006 15:44:58 +0200 Subject: [PATCH] slab.c: fix offslab_limit bug mm/slab.c's offlab_limit logic is totally broken. 
Firstly, "offslab_limit" is a global variable while it should either be calculated in situ or should be passed in as a parameter. Secondly, the more serious problem with it is that the condition for calculating it: if (!(OFF_SLAB(sizes->cs_cachep))) { offslab_limit = sizes->cs_size - sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); is in total disconnect with the condition that makes use of it: /* More than offslab_limit objects will cause problems */ if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) break; but due to offslab_limit being a global variable this breakage was hidden. Up until lockdep came along and perturbed the slab sizes sufficiently so that the first off-slab cache would still see a (non-calculated) zero value for offslab_limit and would panic with: kmem_cache_create: couldn't create cache size-512. Call Trace: [] show_trace+0x96/0x1c8 [] dump_stack+0x13/0x15 [] panic+0x39/0x21a [] kmem_cache_create+0x5a0/0x5d0 [] kmem_cache_init+0x193/0x379 [] start_kernel+0x17f/0x218 [] _sinittext+0x263/0x26a Kernel panic - not syncing: kmem_cache_create(): failed to create slab `size-512' Paolo Ornati's config on x86_64 managed to trigger it. The fix is to move the calculation to the place that makes use of it. This also makes slab.o 54 bytes smaller. Btw., the check itself is quite silly. Its intention is to test whether the number of objects per slab would be higher than the number of slab control pointers possible. In theory it could be triggered: if someone tried to allocate 4-byte objects cache and explicitly requested with CFLGS_OFF_SLAB. So i kept the check. Out of historic interest i checked how old this bug was and it's ancient, 10 years old! It is the oldest hidden and then truly triggering bugs i ever saw being fixed in the kernel! Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/slab.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d31a06bfbea5..f1b644eb39d8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -207,11 +207,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* Max number of objs-per-slab for caches which use off-slab slabs. - * Needed to avoid a possible looping condition in cache_grow(). - */ -static unsigned long offslab_limit; - /* * struct slab * @@ -1356,12 +1351,6 @@ void __init kmem_cache_init(void) NULL, NULL); } - /* Inc off-slab bufctl limit until the ceiling is hit. */ - if (!(OFF_SLAB(sizes->cs_cachep))) { - offslab_limit = sizes->cs_size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); - } - sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, @@ -1780,6 +1769,7 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { + unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1791,9 +1781,18 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; - /* More than offslab_limit objects will cause problems */ - if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) - break; + if (flags & CFLGS_OFF_SLAB) { + /* + * Max number of objs-per-slab for caches which + * use off-slab slabs. Needed to avoid a possible + * looping condition in cache_grow(). 
+ */ + offslab_limit = size - sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); + + if (num > offslab_limit) + break; + } /* Found something acceptable - save it away */ cachep->num = num; -- cgit v1.2.2
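As a worked example of the relocated check (struct sizes are illustrative, 32-bit style): for a cache of 512-byte objects using off-slab management,

	offslab_limit = (size - sizeof(struct slab)) / sizeof(kmem_bufctl_t)
	              = (512 - 32) / 4
	              = 120

so a candidate order is only rejected when cache_estimate() reports more than 120 objects per slab, which ordinary object sizes never approach; as the changelog notes, only something like a deliberately off-slab cache of tiny objects could plausibly trip it.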