From 6047a007d0f6b7395cd158f3bdda34ab39a48821 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 14 Jan 2009 12:22:25 +0200 Subject: SLUB: Use ->objsize from struct kmem_cache_cpu in slab_free() There's no reason to use ->objsize from struct kmem_cache in slab_free() for the SLAB_DEBUG_OBJECTS case. All it does is generate extra cache pressure as we try very hard not to touch struct kmem_cache in the fast-path. Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b1..f21e25ad453b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1724,7 +1724,7 @@ static __always_inline void slab_free(struct kmem_cache *s, c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; -- cgit v1.2.2 From 6e9ed0cc4b963fde66ab47d9fb19147631e44555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Mon, 19 Jan 2009 02:00:38 +0800 Subject: slob: clean up the code - Use NULL instead of plain 0; - Rename slob_page() to is_slob_page(); - Define slob_page() to convert void* to struct slob_page*; - Rename slob_new_page() to slob_new_pages(); - Define slob_free_pages() accordingly. Compile tests only. 
Signed-off-by: WANG Cong Signed-off-by: Matt Mackall Cc: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slob.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..c9cd31d27e69 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * slob_page: True for all slob pages (false for bigblock pages) + * is_slob_page: True for all slob pages (false for bigblock pages) */ -static inline int slob_page(struct slob_page *sp) +static inline int is_slob_page(struct slob_page *sp) { return PageSlobPage((struct page *)sp); } @@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp) __ClearPageSlobPage((struct page *)sp); } +static inline struct slob_page *slob_page(const void *addr) +{ + return (struct slob_page *)virt_to_page(addr); +} + /* * slob_page_free: true for pages on free_slob_pages list. */ @@ -230,7 +235,7 @@ static int slob_last(slob_t *s) return !((unsigned long)slob_next(s) & ~PAGE_MASK); } -static void *slob_new_page(gfp_t gfp, int order, int node) +static void *slob_new_pages(gfp_t gfp, int order, int node) { void *page; @@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node) return page_address(page); } +static void slob_free_pages(void *b, int order) +{ + free_pages((unsigned long)b, order); +} + /* * Allocate a slob block within a given slob_page sp. 
*/ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { - slob_t *prev, *cur, *aligned = 0; + slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { @@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) - return 0; - sp = (struct slob_page *)virt_to_page(b); + return NULL; + sp = slob_page(b); set_slob_page(sp); spin_lock_irqsave(&slob_lock, flags); @@ -384,7 +394,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = (struct slob_page *)virt_to_page(block); + sp = slob_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -476,7 +486,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { void *ret; - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -494,8 +504,8 @@ void kfree(const void *block) if (unlikely(ZERO_OR_NULL_PTR(block))) return; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); @@ -513,8 +523,8 @@ size_t ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; @@ -572,7 +582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if 
(c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align, node); else - b = slob_new_page(flags, get_order(c->size), node); + b = slob_new_pages(flags, get_order(c->size), node); if (c->ctor) c->ctor(b); @@ -586,7 +596,7 @@ static void __kmem_cache_free(void *b, int size) if (size < PAGE_SIZE) slob_free(b, size); else - free_pages((unsigned long)b, get_order(size)); + slob_free_pages(b, get_order(size)); } static void kmem_rcu_free(struct rcu_head *head) -- cgit v1.2.2 From 6146f0d5e47ca4047ffded0fb79b6c25359b386c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:57 -0500 Subject: integrity: IMA hooks This patch replaces the generic integrity hooks, for which IMA registered itself, with IMA integrity hooks in the appropriate places directly in the fs directory. Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab6..c3647f3b0621 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1048,6 +1049,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } error = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (error) + return error; + error = ima_file_mmap(file, prot); if (error) return error; -- cgit v1.2.2 From 1df9f0a73178718969ae47d813b8e7aab2cf073c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:07:02 -0500 Subject: Integrity: IMA file free imbalance The number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first being measured. Although f_count is incremented/decremented in places other than fget/fput, like fget_light/fput_light and get_file, the current task must already hold a file refcnt. 
The call to __fput() is delayed until the refcnt becomes 0, resulting in ima_file_free() flagging any changes. - add hook to increment opencount for IPC shared memory(SYSV), shmat files, and /dev/zero - moved NULL iint test in opencount_get() Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..dd5588f5d939 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -2600,6 +2601,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); + ima_shm_check(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; -- cgit v1.2.2 From ed850a52af971528b048812c4215cef298af0d3b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 10 Feb 2009 23:01:19 -0500 Subject: integrity: shmem zero fix Based on comments from Mike Frysinger and Randy Dunlap: (http://lkml.org/lkml/2009/2/9/262) - moved ima.h include before CONFIG_SHMEM test to fix compiler error on Blackfin: mm/shmem.c: In function 'shmem_zero_setup': mm/shmem.c:2670: error: implicit declaration of function 'ima_shm_check' - added 'struct linux_binprm' in ima.h to fix compiler warning on Blackfin: In file included from mm/shmem.c:32: include/linux/ima.h:25: warning: 'struct linux_binprm' declared inside parameter list include/linux/ima.h:25: warning: its scope is only this definition or declaration, which is probably not what you want - moved fs.h include within _LINUX_IMA_H definition Signed-off-by: Mimi Zohar Signed-off-by: Mike Frysinger Signed-off-by: James Morris --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 75199888a6bd..8135fac294ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -59,7 +60,6 @@ static 
struct vfsmount *shm_mnt; #include #include #include -#include #include #include -- cgit v1.2.2 From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 21 Jan 2009 08:12:39 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS) Here is another version, with the incremental patch rolled up, and added reclaim context annotation to kswapd, and allocation tracing to slab allocators (which may only ever reach the page allocator in rare cases, so it is good to put annotations here too). Haven't tested this version as such, but it should be getting closer to merge worthy ;) -- After noticing some code in mm/filemap.c accidentally perform a __GFP_FS allocation when it should not have been, I thought it might be a good idea to try to catch this kind of thing with lockdep. I coded up a little idea that seems to work. Unfortunately the system has to actually be in __GFP_FS page reclaim, then take the lock, before it will mark it. But at least that might still be some orders of magnitude more common (and more debuggable) than an actual deadlock condition, so we have some improvement I hope (the concept is no less complete than discovery of a lock's interrupt contexts). I guess we could even do the same thing with __GFP_IO (normal reclaim), and even GFP_NOIO locks too... but filesystems will have the most locks and fiddly code paths, so let's start there and see how it goes. It *seems* to work. I did a quick test. ================================= [ INFO: inconsistent lock state ] 2.6.28-rc6-00007-ged31348-dirty #26 --------------------------------- inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage. 
modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] {in-reclaim-W} state was registered at: [] __lock_acquire+0x75b/0x1a60 [] lock_acquire+0x91/0xc0 [] mutex_lock_nested+0xb1/0x310 [] brd_init+0x2b/0x216 [brd] [] _stext+0x3b/0x170 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff irq event stamp: 3929 hardirqs last enabled at (3929): [] mutex_lock_nested+0x285/0x310 hardirqs last disabled at (3928): [] mutex_lock_nested+0x59/0x310 softirqs last enabled at (3732): [] sk_filter+0x83/0xe0 softirqs last disabled at (3730): [] sk_filter+0x16/0xe0 other info that might help us debug this: 1 lock held by modprobe/8526: #0: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] stack backtrace: Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26 Call Trace: [] print_usage_bug+0x193/0x1d0 [] mark_lock+0xaf0/0xca0 [] mark_held_locks+0x55/0xc0 [] ? brd_init+0x0/0x216 [brd] [] trace_reclaim_fs+0x2a/0x60 [] __alloc_pages_internal+0x475/0x580 [] ? mutex_lock_nested+0x26e/0x310 [] ? brd_init+0x0/0x216 [brd] [] brd_init+0x6a/0x216 [brd] [] ? brd_init+0x0/0x216 [brd] [] _stext+0x3b/0x170 [] ? mutex_unlock+0x9/0x10 [] ? __mutex_unlock_slowpath+0x10d/0x180 [] ? 
trace_hardirqs_on_caller+0x12c/0x190 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Nick Piggin Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 5 +++++ mm/slab.c | 4 ++++ mm/slob.c | 2 ++ mm/slub.c | 1 + mm/vmscan.c | 3 +++ 5 files changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b3073854..22b15a4cde8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + lockdep_trace_alloc(gfp_mask); + might_sleep_if(wait); if (should_fail_alloc_page(gfp_mask, order)) @@ -1578,12 +1580,15 @@ nofail_alloc: */ cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; + + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); p->flags &= ~PF_MEMALLOC; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d58..6b61de8543ec 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3318,6 +3318,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3394,6 +3396,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..1264799df5d1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -464,6 +464,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + lockdep_trace_alloc(flags); + if (size < PAGE_SIZE - align) { if (!size) return 
ZERO_SIZE_PTR; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..214eb207c513 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1596,6 +1596,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); if (should_failslab(s->objsize, gfpflags)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a27c44aa327..303eb658b50b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,6 +1963,9 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; + + lockdep_set_current_reclaim_state(GFP_KERNEL); + node_to_cpumask_ptr(cpumask, pgdat->node_id); if (!cpumask_empty(cpumask)) -- cgit v1.2.2 From 6700ec65c207068a81a535e9dca616fefac21671 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Feb 2009 21:18:17 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS), fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning Fix: mm/vmscan.c: In function ‘kswapd’: mm/vmscan.c:1969: warning: ISO C90 forbids mixed declarations and code node_to_cpumask_ptr(cpumask, pgdat->node_id), has a side-effect: it defines the 'cpumask' local variable as well, so it has to go into the variable definition section. Sidenote: it might make sense to make this purpose of these macros more apparent, by naming them the standard way, such as: DEFINE_node_to_cpumask_ptr(cpumask, pgdat->node_id); (But that is outside the scope of this patch.) 
Cc: Rusty Russell Cc: Mike Travis Cc: Andrew Morton Cc: Nick Piggin Signed-off-by: Ingo Molnar --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 303eb658b50b..cf8441345277 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,11 +1963,10 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; + node_to_cpumask_ptr(cpumask, pgdat->node_id); lockdep_set_current_reclaim_state(GFP_KERNEL); - node_to_cpumask_ptr(cpumask, pgdat->node_id); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; -- cgit v1.2.2 From 734269521e320ad14ed39ae9b64d482b9028dcd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8c..c37924a2ee36 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.2 From f2a8205c4ef1af917d175c36a4097ae5587791c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: kill percpu_alloc() and friends Impact: kill unused functions percpu_alloc() and its friends never saw much action. 
It was supposed to replace the cpu-mask unaware __alloc_percpu() but it never happened and in fact __percpu_alloc_mask() itself never really grew proper up/down handling interface either (no exported interface for populate/depopulate). percpu allocation is about to go through major reimplementation and there's no reason to carry this unused interface around. Replace it with __alloc_percpu() and free_percpu(). Signed-off-by: Tejun Heo --- mm/allocpercpu.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 4297bc41bfd2..3653c570232b 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. */ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. 
+ */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. */ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.2 From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: implement vm_area_register_early() Impact: allow multiple early vm areas There are places where kernel VM area needs to be allocated before vmalloc is initialized. This is done by allocating static vm_struct, initializing several fields and linking it to vmlist and later vmalloc initialization picking up these from vmlist. This is currently done manually and if there's more than one such areas, there's no defined way to arbitrate who gets which address. This patch implements vm_area_register_early(), which takes vm_area struct with flags and size initialized, assigns address to it and puts it on the vmlist. This way, multiple early vm areas can determine which addresses they should use. The only current user - alpha mm init - is converted to use it. 
Signed-off-by: Tejun Heo --- mm/vmalloc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c37924a2ee36..d206261ad9ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @size: size of area to register + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm) +{ + static size_t vm_init_off __initdata; + + vm->addr = (void *)VMALLOC_START + vm_init_off; + vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; -- cgit v1.2.2 From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: add un/map_kernel_range_noflush() Impact: two more public map/unmap functions Implement map_kernel_range_noflush() and unmap_kernel_range_noflush(). These functions respectively map and unmap address range in kernel VM area but doesn't do any vcache or tlb flushing. These will be used by new percpu allocator. 
Signed-off-by: Tejun Heo Cc: Nick Piggin --- mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d206261ad9ef..224eca9650a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1033,6 +1042,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. 
+ */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-unmapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; -- cgit v1.2.2 From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details.
Signed-off-by: Tejun Heo Cc: Andrew Morton --- mm/Makefile | 4 + mm/percpu.c | 890 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 894 insertions(+) create mode 100644 mm/percpu.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000000..4617d97e877c --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,890 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. 
Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be equal to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the following.
+ * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_static() during percpu area initialization to + * setup kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages_shift; +static int pcpu_unit_pages; +static int pcpu_unit_shift; +static int pcpu_unit_size; +static int pcpu_chunk_size; +static int pcpu_nr_slots; +static size_t pcpu_chunk_struct_size; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size; + +/* + * One mutex to rule them all. + * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. 
However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int pcpu_size_to_slot(int size) +{ + int highbit = fls(size); + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return (cpu << pcpu_unit_pages_shift) + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size (can be 0 for new allocations) + * @new_size: the wanted new size (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. 
+ */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searches for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk.
+ */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size (can be 0) + * @tail: tail size (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? 
*/ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. 
+ */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. + */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the 
upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. + * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. 
+ */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. + */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, + size_t size, bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? 
i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int map_end; + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. 
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_static - initialize kernel static percpu area + * @populate_pte_fn: callback to allocate pagetable + * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * + * Initialize kernel static percpu area. The caller should allocate + * all the necessary pages and pass them in @pages. + * @populate_pte_fn() is called on each page to be used for percpu + * mapping and is responsible for making sure all the necessary page + * tables for the page are allocated. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access.
+ */ +size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); + unsigned int cpu; + int err, i; + + pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, + order_base_2(cpu_size) - PAGE_SHIFT); + + pcpu_static_size = cpu_size; + pcpu_unit_pages = 1 << pcpu_unit_pages_shift; + pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; + pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + /* allocate chunk slots */ + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init and register vm area */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + vm_area_register_early(&static_vm); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; + + /* assign pages and map them */ + for_each_possible_cpu(cpu) { + for (i = 0; i < nr_cpu_pages; i++) { + *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; + populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + } + } + + err = pcpu_map(static_chunk, 0, nr_cpu_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", err); + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} -- cgit v1.2.2 From 
ffadd4d0feb5376c82dc3a4104731b7ce2794edc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], 
"kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.2 From e8120ff1ffc51102ead1f4c98a3fd5d26fefc722 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Thu, 12 Feb 2009 18:00:17 +0200 Subject: SLUB: Fix default slab order for big object sizes The default order of kmalloc-8192 on 2*4 stoakley is an issue of calculate_order. slab_size order name ------------------------------------------------- 4096 3 sgpool-128 8192 2 kmalloc-8192 16384 3 kmalloc-16384 kmalloc-8192's default order is smaller than sgpool-128's. On 4*4 tigerton machine, a similiar issue appears on another kmem_cache. Function calculate_order uses 'min_objects /= 2;' to shrink. Plus size calculation/checking in slab_order, sometimes above issue appear. Below patch against 2.6.29-rc2 fixes it. I checked the default orders of all kmem_cache and they don't become smaller than before. So the patch wouldn't hurt performance. 
Signed-off-by: Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5a5e7f5bf799..c01a7a3001d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1844,6 +1844,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1856,6 +1857,9 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { fraction = 16; while (fraction >= 4) { @@ -1865,7 +1869,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects --; } /* -- cgit v1.2.2 From cae3aeb83fef5a7c9c8ac40e653e59dd9a35469c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 21 Feb 2009 16:56:23 +0900 Subject: percpu: clean up size usage Andrew was concerned about the unit of variables that are named size or have a size suffix. Every such usage in the percpu allocator is in bytes, but make that super clear by adding comments. While at it, make pcpu_depopulate_chunk() take int @off and @size like everyone else.
Signed-off-by: Tejun Heo Cc: Andrew Morton --- mm/percpu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 4617d97e877c..997724c2ea24 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -119,7 +119,7 @@ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int pcpu_size_to_slot(int size) { - int highbit = fls(size); + int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } @@ -158,8 +158,8 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, /** * pcpu_realloc - versatile realloc * @p: the current pointer (can be NULL for new allocations) - * @size: the current size (can be 0 for new allocations) - * @new_size: the wanted new size (can be 0 for free) + * @size: the current size in bytes (can be 0 for new allocations) + * @new_size: the wanted new size in bytes (can be 0 for free) * * More robust realloc which can be used to allocate, resize or free a * memory area of arbitrary size. If the needed size goes over @@ -290,8 +290,8 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * pcpu_split_block - split a map block * @chunk: chunk of interest * @i: index of map block to split - * @head: head size (can be 0) - * @tail: tail size (can be 0) + * @head: head size in bytes (can be 0) + * @tail: tail size in bytes (can be 0) * * Split the @i'th map block into two or three blocks. If @head is * non-zero, @head bytes block is inserted before block @i moving it @@ -346,7 +346,7 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest - * @size: wanted size + * @size: wanted size in bytes * @align: wanted align * * Try to allocate @size bytes area aligned at @align from @chunk. 
@@ -540,15 +540,15 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @off: offset to the area to depopulate - * @size: size of the area to depopulate + * @size: size of the area to depopulate in bytes * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, - size_t size, bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, + bool flush) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); @@ -617,7 +617,7 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @off: offset to the area to populate - * @size: size of the area to populate + * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. @@ -707,7 +707,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) /** * __alloc_percpu - allocate percpu area - * @size: size of area to allocate + * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Allocate percpu area of @size bytes aligned at @align. Might @@ -819,6 +819,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_static - initialize kernel static percpu area * @populate_pte_fn: callback to allocate pagetable * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * @cpu_size: the size of static percpu area in bytes * * Initialize kernel static percpu area. The caller should allocate * all the necessary pages and pass them in @pages. 
-- cgit v1.2.2 From 3b89d7d881a1dbb4da158f7eb5d6b3ceefc72810 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:07 -0800 Subject: slub: move min_partial to struct kmem_cache Although it allows for better cacheline use, it is unnecessary to save a copy of the cache's min_partial value in each kmem_cache_node. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..4fff385b17a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1335,7 +1335,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1387,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1928,17 +1928,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. 
- */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2181,6 +2170,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2319,6 +2317,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + calculate_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; -- cgit v1.2.2 From 73d342b169db700b5a6ad626fe4b86911efec8db Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:09 -0800 Subject: slub: add min_partial sysfs tunable Now that a cache's min_partial has been moved to struct kmem_cache, it's possible to easily tune it from userspace by adding a sysfs attribute. It may not be desirable to keep a large number of partial slabs around if a cache is used infrequently and memory, especially when constrained by a cgroup, is scarce. It's better to allow userspace to set the minimum policy per cache instead of relying explicitly on kmem_cache_shrink(). 
The memory savings from simply moving min_partial from struct kmem_cache_node to struct kmem_cache are obviously not significant (unless maybe you're from SGI or something); at the largest they are # allocated caches * (MAX_NUMNODES - 1) * sizeof(unsigned long) The true savings occur when userspace reduces the number of partial slabs that would otherwise be wasted, especially on machines with a large number of nodes (ia64 with CONFIG_NODES_SHIFT at 10 for default?). However well the kernel estimates ideal values for n->min_partial and ensures they are within a sane range, userspace has no input other than writing to /sys/kernel/slab/cache/shrink. There simply isn't any better heuristic to add when calculating the partial values for a better estimate that works for all possible caches. And since it's currently a static value, the user really has no way of reclaiming that wasted space, which can be significant when constrained by a cgroup (either cpusets or, later, memory controller slab limits) without shrinking it entirely. This also allows the user to specify that increased fragmentation and more partial slabs are actually desired to avoid the cost of allocating new slabs at runtime for specific caches. There's also no reason why this should be a per-struct kmem_cache_node value in the first place. You could argue that a machine would have such node size asymmetries that it should be specified on a per-node basis, but we know nobody is doing that right now since it's a purely static value at the moment and there's no convenient way to tune that via slub's sysfs interface.
Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4fff385b17a3..a3e2d552ff46 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3838,6 +3838,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + calculate_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (s->ctor) { @@ -4153,6 +4173,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &total_objects_attr.attr, -- cgit v1.2.2 From cb83b42e23bd6c4bf91793a320fbe83787c13596 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: percpu: fix pcpu_chunk_struct_size Impact: fix short allocation leading to memory corruption While dropping rvalue wrapping macros around global parameters, pcpu_chunk_struct_size was set incorrectly resulting in shorter page pointer array. Fix it. 
Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 997724c2ea24..ed92caa2aa3b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -850,7 +850,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); /* allocate chunk slots */ pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); -- cgit v1.2.2 From c132937556f56ee4b831ef4b23f1846e05fde102 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: bootmem: clean up arch-specific bootmem wrapping Impact: cleaner and consistent bootmem wrapping By setting CONFIG_HAVE_ARCH_BOOTMEM_NODE, archs can define arch-specific wrappers for bootmem allocation. However, this is done a bit strangely in that only the high level convenience macros can be changed while lower level, but still exported, interface functions can't be wrapped. This not only is messy but also leads to strange situation where alloc_bootmem() does what the arch wants it to do but the equivalent __alloc_bootmem() call doesn't although they should be able to be used interchangeably. This patch updates bootmem such that archs can override / wrap the backend function - alloc_bootmem_core() instead of the highlevel interface functions to allow simpler and consistent wrapping. Also, HAVE_ARCH_BOOTMEM_NODE is renamed to HAVE_ARCH_BOOTMEM. 
Signed-off-by: Tejun Heo Cc: Johannes Weiner --- mm/bootmem.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 51a0ccf61e0e..d7140c008ba8 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; +/* + * If an arch needs to apply workarounds to bootmem allocation, it can + * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around + * __alloc_bootmem_core(). + */ +#ifndef CONFIG_HAVE_ARCH_BOOTMEM +#define alloc_bootmem_core(bdata, size, align, goal, limit) \ + __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) +#endif + static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -- cgit v1.2.2 From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: vmalloc: add @align to vm_area_register_early() Impact: allow larger alignment for early vmalloc area allocation Some 
early vmalloc users might want larger alignment, for example, for custom large page mapping. Add @align to vm_area_register_early(). While at it, drop docbook comment on non-existent @size. Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ivan Kokshaysky --- mm/percpu.c | 2 +- mm/vmalloc.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ed92caa2aa3b..41e7a5f5ab1b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, /* init and register vm area */ static_vm.flags = VM_ALLOC; static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm); + vm_area_register_early(&static_vm, PAGE_SIZE); /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 224eca9650a8..366ae9ea6af2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram); /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register - * @size: size of area to register + * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain @@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram); * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
*/ -void __init vm_area_register_early(struct vm_struct *vm) +void __init vm_area_register_early(struct vm_struct *vm, size_t align) { static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - vm->addr = (void *)VMALLOC_START + vm_init_off; - vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + vm->addr = (void *)addr; vm->next = vmlist; vmlist = vm; -- cgit v1.2.2 From d9b55eeb1d55ef2dc5a4fdbff9604c2c68cb5649 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: remove unit_size power-of-2 restriction Impact: allow unit_size to be arbitrary multiple of PAGE_SIZE In the dynamic percpu allocator, there is no reason the unit size should be a power of two. Remove the restriction. A non-power-of-two unit size means that empty chunks fall into the same slot index as lightly occupied chunks, which is bad for reclaiming, so reserve an extra slot for empty chunks.
Signed-off-by: Tejun Heo --- mm/percpu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 41e7a5f5ab1b..d9e6e5d1dbd4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -67,7 +67,7 @@ #include #include -#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -83,9 +83,7 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages_shift; static int pcpu_unit_pages; -static int pcpu_unit_shift; static int pcpu_unit_size; static int pcpu_chunk_size; static int pcpu_nr_slots; @@ -117,12 +115,19 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ -static int pcpu_size_to_slot(int size) +static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_nr_slots - 1; + return __pcpu_size_to_slot(size); +} + static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) @@ -133,7 +138,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return (cpu << pcpu_unit_pages_shift) + page_idx; + return cpu * pcpu_unit_pages + page_idx; } static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, @@ -659,7 +664,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) goto err; for_each_possible_cpu(cpu) - memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + memset(chunk->vm->addr + cpu * pcpu_unit_size + 
off, 0, size); return 0; @@ -722,7 +727,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -840,19 +845,19 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, unsigned int cpu; int err, i; - pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, - order_base_2(cpu_size) - PAGE_SHIFT); + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); pcpu_static_size = cpu_size; - pcpu_unit_pages = 1 << pcpu_unit_pages_shift; - pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; - pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); - /* allocate chunk slots */ + /* + * Allocate chunk slots. The additional last slot is for + * empty chunks. + */ + pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); -- cgit v1.2.2 From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). 
It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. * Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- mm/percpu.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 116 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index d9e6e5d1dbd4..9ac01980cce0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -48,8 +48,8 @@ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back * - * - use pcpu_setup_static() during percpu area initialization to - * setup kernel static percpu area + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area */ #include @@ -67,7 +67,6 @@ #include #include -#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -80,6 +79,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ struct page *page[]; /* #cpus * UNIT_PAGES */ }; @@ -521,6 +521,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + /* unmap must not be done on immutable chunk */ + 
WARN_ON(chunk->immutable); + /* * Each flushing trial can be very expensive, issue flush on * the whole region at once rather than doing it for each cpu. @@ -602,6 +605,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) unsigned int cpu; int err; + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { err = map_kernel_range_noflush( pcpu_chunk_addr(chunk, cpu, page_start), @@ -727,8 +733,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || - align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); return NULL; @@ -776,6 +781,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { + WARN_ON(chunk->immutable); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); list_del(&chunk->list); rb_erase(&chunk->rb_node, &pcpu_addr_root); @@ -821,33 +827,73 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** - * pcpu_setup_static - initialize kernel static percpu area - * @populate_pte_fn: callback to allocate pagetable - * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages - * @cpu_size: the size of static percpu area in bytes - * - * Initialize kernel static percpu area. The caller should allocate - * all the necessary pages and pass them in @pages. - * @populate_pte_fn() is called on each page to be used for percpu - * mapping and is responsible for making sure all the necessary page - * tables for the page is allocated. 
+ * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto + * @free_size: free size in bytes, 0 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @unit_size, if non-zero, determines unit size and must be aligned + * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * + * @free_size determines the number of free bytes after the static + * area in the first chunk. If zero, whatever left is available. + * Specifying non-zero value make percpu leave the area after + * @static_size + @free_size alone. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. 
*/ -size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size) +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct static_vm; struct pcpu_chunk *static_chunk; - int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); unsigned int cpu; + int nr_pages; int err, i; - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); + /* santiy checks */ + BUG_ON(!static_size); + BUG_ON(!unit_size && free_size); + BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(base_addr && !unit_size); + BUG_ON(base_addr && populate_pte_fn); - pcpu_static_size = cpu_size; + if (unit_size) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(static_size)); + + pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -862,29 +908,66 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init and register vm area */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm, PAGE_SIZE); - /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&static_chunk->list); static_chunk->vm = &static_vm; - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + + if (free_size) + static_chunk->free_size = free_size; + else + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; - /* assign pages and map them */ + /* allocate vm address */ + static_vm.flags = VM_ALLOC; + static_vm.size = 
pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&static_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the static chunk can't + * be mapped or unmapped by percpu and is marked + * immutable. + */ + static_vm.addr = base_addr; + static_chunk->immutable = true; + } + + /* assign pages */ + nr_pages = -1; for_each_possible_cpu(cpu) { - for (i = 0; i < nr_cpu_pages; i++) { - *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; - populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(static_chunk, cpu, i) = page; } + + BUG_ON(i < PFN_UP(pcpu_static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); } - err = pcpu_map(static_chunk, 0, nr_cpu_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", err); + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(static_chunk, + cpu, i)); + + err = pcpu_map(static_chunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } /* link static_chunk in */ pcpu_chunk_relocate(static_chunk, -1); -- cgit v1.2.2 From 40150d37be7f7949b2ec07d511244da856647d84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 12:32:28 +0900 Subject: percpu: add __read_mostly to variables which are mostly read only Most global variables in percpu allocator are initialized during boot and read only from that point on. Add __read_mostly as per Rusty's suggestion. 
Signed-off-by: Tejun Heo Cc: Rusty Russell --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 9ac01980cce0..5954e7a9eb1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -83,18 +83,18 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages; -static int pcpu_unit_size; -static int pcpu_chunk_size; -static int pcpu_nr_slots; -static size_t pcpu_chunk_struct_size; +static int pcpu_unit_pages __read_mostly; +static int pcpu_unit_size __read_mostly; +static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_slots __read_mostly; +static size_t pcpu_chunk_struct_size __read_mostly; /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr; +void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); /* the size of kernel static area */ -static int pcpu_static_size; +static int pcpu_static_size __read_mostly; /* * One mutex to rule them all. @@ -112,7 +112,7 @@ static int pcpu_static_size; */ static DEFINE_MUTEX(pcpu_mutex); -static struct list_head *pcpu_slot; /* chunk list slots */ +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int __pcpu_size_to_slot(int size) -- cgit v1.2.2 From c0bdb232b23b51c23e551041510ad6bea5ce5a92 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 25 Feb 2009 09:16:35 +0200 Subject: slub: rename calculate_min_partial() to set_min_partial() As suggested by Christoph Lameter, rename calculate_min_partial() to set_min_partial() as the function doesn't really do any calculations. 
Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a3e2d552ff46..77268d18e78d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2170,7 +2170,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif -static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +static void set_min_partial(struct kmem_cache *s, unsigned long min) { if (min < MIN_PARTIAL) min = MIN_PARTIAL; @@ -2321,7 +2321,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - calculate_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3853,7 +3853,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - calculate_min_partial(s, min); + set_min_partial(s, min); return length; } SLAB_ATTR(min_partial); -- cgit v1.2.2 From 3255aa2eb636a508fc82a73fabbb8aaf2ff23c0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 08:21:52 +0100 Subject: x86, mm: pass in 'total' to __copy_from_user_*nocache() Impact: cleanup, enable future change Add a 'total bytes copied' parameter to __copy_from_user_*nocache(), and update all the callsites. The parameter is not used yet - architecture code can use it to more intelligently decide whether the copy should be cached or non-temporal. 
Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- mm/filemap.c | 10 ++++++---- mm/filemap_xip.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 23acefe51808..60fd56772cc6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0; + size_t copied = 0, left = 0, total = bytes; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,8 +1851,9 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes); + buf, bytes, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1880,7 +1881,8 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + + left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b7..bf54f8a2cf1d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user *buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); if (likely(copied > 0)) { status 
= copied; -- cgit v1.2.2 From 34754b69a6f87aa6aa2860525a82f12532f83afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 16:04:03 +0100 Subject: x86: make vmap yell louder when it is used under irqs_disabled() Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4dd2636d0b92..f83a70167b99 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1257,6 +1257,7 @@ EXPORT_SYMBOL(vfree); void vunmap(const void *addr) { BUG_ON(in_interrupt()); + might_sleep(); __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1276,6 +1277,8 @@ void *vmap(struct page **pages, unsigned int count, { struct vm_struct *area; + might_sleep(); + if (count > num_physpages) return NULL; -- cgit v1.2.2 From 02d51fdfb2bfcf6bbd776f983177f55868aa0a79 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 15:42:36 +0900 Subject: percpu: kill compile warning in pcpu_populate_chunk() Impact: remove compile warning Mark local variable map_end in pcpu_populate_chunk() with uninitialized_var(). The variable is always used in tandem with map_start and guaranteed to be initialized before use but gcc doesn't understand that. 
Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 5954e7a9eb1e..3d0f5456827c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -639,7 +639,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); int map_start = -1; - int map_end; + int uninitialized_var(map_end); unsigned int cpu; int i; -- cgit v1.2.2 From d0c4f570276cb4d2dc4215b90eb7cb6e2bdd4a15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 16:06:56 +0900 Subject: bootmem, x86: further fixes for arch-specific bootmem wrapping Impact: fix new breakages introduced by previous fix Commit c132937556f56ee4b831ef4b23f1846e05fde102 tried to clean up bootmem arch wrapper but it wasn't quite correct. Before the commit, the following were broken. * Low level interface functions prefixed with __ ignored arch preference. * reserve_bootmem(...) can't be mapped into reserve_bootmem_node(NODE_DATA(0)->bdata, ...) because the node is not a preference here. The region specified MUST fall into the specified region; otherwise, it will panic. After the commit, * If allocation fails for the arch preferred node, it should fall back to whatever is available. Instead, it simply failed allocation. There are too many internal details to allow generic wrapping and still keep things simple for archs. Plus, all that arch wants is a way to prefer certain node over another. This patch drops the generic wrapping around alloc_bootmem_core() and adds alloc_arch_preferred_bootmem() instead. If necessary, arch can define bootmem_arch_preferred_node() macro or function which takes all allocation information and returns the preferred node. bootmem generic code will always try the preferred node first and then fall back to other nodes as usual. Breakages noted and changes reviewed by Johannes Weiner.
Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- mm/bootmem.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d7140c008ba8..daf92713f7de 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,16 +37,6 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; -/* - * If an arch needs to apply workarounds to bootmem allocation, it can - * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around - * __alloc_bootmem_core(). - */ -#ifndef CONFIG_HAVE_ARCH_BOOTMEM -#define alloc_bootmem_core(bdata, size, align, goal, limit) \ - __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) -#endif - static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -436,9 +426,9 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -538,17 +528,34 @@ find_block: return NULL; } +static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ +#ifdef CONFIG_HAVE_ARCH_BOOTMEM + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, goal, limit); +#endif + return NULL; +} + static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; + void *region; 
restart: - list_for_each_entry(bdata, &bdata_list, list) { - void *region; + region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); + if (region) + return region; + list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -626,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; + ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -682,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; -- cgit v1.2.2 From f180053694b43d5714bf56cb95499a3c32ff155c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Mar 2009 11:00:57 +0100 Subject: x86, mm: dont use non-temporal stores in pagecache accesses Impact: standardize IO on cached ops On modern CPUs it is almost always a bad idea to use non-temporal stores, as the regression in this commit has shown it: 30d697f: x86: fix performance regression in write() syscall The kernel simply has no good information about whether using non-temporal stores is a good idea or not - and trying to add heuristics only increases complexity and inserts fragility. The regression on cached write()s took very long to be found - over two years. So dont take any chances and let the hardware decide how it makes use of its caches. The only exception is drivers/gpu/drm/i915/i915_gem.c: there were we are absolutely sure that another entity (the GPU) will pick up the dirty data immediately and that the CPU will not touch that data before the GPU will. 
Also, keep the _nocache() primitives to make it easier for people to experiment with these details. There may be more clear-cut cases where non-cached copies can be used, outside of filemap.c. Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- mm/filemap.c | 11 ++++------- mm/filemap_xip.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 60fd56772cc6..126d3973b3d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0, total = bytes; + size_t copied = 0, left = 0; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,9 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes, bytes); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1881,8 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); + left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index bf54f8a2cf1d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ 
__xip_file_write(struct file *filp, const char __user *buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; -- cgit v1.2.2 From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: cosmetic renames in pcpu_setup_first_chunk() Impact: cosmetic, preparation for future changes Make the following renames in pcpu_setup_first_chunk() in preparation for future changes. * s/free_size/dyn_size/ * s/static_vm/first_vm/ * s/static_chunk/schunk/ Signed-off-by: Tejun Heo --- mm/percpu.c | 58 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 3d0f5456827c..9531590e6b69 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto + * @dyn_size: free size for dynamic allocation in bytes, 0 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * return the same number of pages for all cpus. * * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. * - * @free_size determines the number of free bytes after the static + * @dyn_size determines the number of free bytes after the static * area in the first chunk. If zero, whatever left is available.
* Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. + * @static_size + @dyn_size alone. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; + static struct vm_struct first_vm; + struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(!unit_size && dyn_size); + BUG_ON(unit_size && unit_size < static_size + dyn_size); BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(base_addr && !unit_size); BUG_ON(base_addr && populate_pte_fn); @@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; + /* init static chunk */ + schunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&schunk->list); + schunk->vm = &first_vm; - if (free_size) - static_chunk->free_size = free_size; + if (dyn_size) + schunk->free_size = dyn_size; else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - pcpu_static_size; - static_chunk->contig_hint = static_chunk->free_size; + schunk->contig_hint = schunk->free_size; /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; if 
(!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); + vm_area_register_early(&first_vm, PAGE_SIZE); else { /* * Pages already mapped. No need to remap into @@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, * be mapped or unmapped by percpu and is marked * immutable. */ - static_vm.addr = base_addr; - static_chunk->immutable = true; + first_vm.addr = base_addr; + schunk->immutable = true; } /* assign pages */ @@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (!page) break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + *pcpu_chunk_pagep(schunk, cpu, i) = page; } BUG_ON(i < PFN_UP(pcpu_static_size)); @@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (populate_pte_fn) { for_each_possible_cpu(cpu) for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, + populate_pte_fn(pcpu_chunk_addr(schunk, cpu, i)); - err = pcpu_map(static_chunk, 0, nr_pages); + err = pcpu_map(schunk, 0, nr_pages); if (err) panic("failed to setup static percpu area, err=%d\n", err); } - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); + /* link the first chunk in */ + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } -- cgit v1.2.2 From 61ace7fa2fff9c4b6641c506b6b3f1a9394a1b11 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: improve first chunk initial area map handling Impact: no functional change When the first chunk is created, its initial area map is not allocated because kmalloc isn't online yet. The map is allocated and initialized on the first allocation request on the chunk. 
This works fine but the scattering of initialization logic between the init function and allocation path is a bit confusing. This patch makes the first chunk initialize and use minimal statically allocated map from pcpu_setup_first_chunk(). The map resizing path still needs to handle this specially but it's more straight-forward and gives more latitude to the init path. This will ease future changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 9531590e6b69..503ccad091af 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -93,9 +93,6 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* the size of kernel static area */ -static int pcpu_static_size __read_mostly; - /* * One mutex to rule them all. * @@ -316,15 +313,28 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) /* reallocation required? */ if (chunk->map_alloc < target) { - int new_alloc = chunk->map_alloc; + int new_alloc; int *new; + new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < target) new_alloc *= 2; - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); + if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) { + /* + * map_alloc smaller than the default size + * indicates that the chunk is one of the + * first chunks and still using static map. + * Allocate a dynamic one and copy.
+ */ + new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); + if (new) + memcpy(new, chunk->map, + chunk->map_alloc * sizeof(new[0])); + } else + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; @@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int max_contig = 0; int i, off; - /* - * The static chunk initially doesn't have map attached - * because kmalloc wasn't available during init. Give it one. - */ - if (unlikely(!chunk->map)) { - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - if (!chunk->map) - return -ENOMEM; - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = -pcpu_static_size; - if (chunk->free_size) - chunk->map[chunk->map_used++] = chunk->free_size; - } - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { bool is_last = i + 1 == chunk->map_used; int head, tail; @@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; + static int smap[2]; struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); BUG_ON(!unit_size && dyn_size); BUG_ON(unit_size && unit_size < static_size + dyn_size); @@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, PFN_UP(static_size)); - pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; + 
schunk->map = smap; + schunk->map_alloc = ARRAY_SIZE(smap); if (dyn_size) schunk->free_size = dyn_size; else - schunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - static_size; schunk->contig_hint = schunk->free_size; + schunk->map[schunk->map_used++] = -static_size; + if (schunk->free_size) + schunk->map[schunk->map_used++] = schunk->free_size; + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, *pcpu_chunk_pagep(schunk, cpu, i) = page; } - BUG_ON(i < PFN_UP(pcpu_static_size)); + BUG_ON(i < PFN_UP(static_size)); if (nr_pages < 0) nr_pages = i; -- cgit v1.2.2 From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Also, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing.
Signed-off-by: Tejun Heo --- mm/percpu.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 503ccad091af..a84cf9977faf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @dyn_size: free size for dynamic allocation in bytes, 0 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. + * @unit_size, if non-negative, specifies unit size and must be + * aligned to PAGE_SIZE and equal to or larger than @static_size + + * @dyn_size. * - * @dyn_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @dyn_size alone. + * @dyn_size, if non-negative, limits the number of bytes available + * for dynamic allocation in the first chunk. Specifying non-negative + * value make percpu leave alone the area beyond @static_size + + * @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. 
percpu must not mess @@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; @@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - BUG_ON(!unit_size && dyn_size); - BUG_ON(unit_size && unit_size < static_size + dyn_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); + if (unit_size >= 0) { + BUG_ON(unit_size < static_size + + (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size & ~PAGE_MASK); + } else { + BUG_ON(dyn_size >= 0); + BUG_ON(base_addr); + } BUG_ON(base_addr && populate_pte_fn); - if (unit_size) + if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, @@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + if (dyn_size < 0) + dyn_size = pcpu_unit_size - static_size; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. 
@@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - - if (dyn_size) - schunk->free_size = dyn_size; - else - schunk->free_size = pcpu_unit_size - static_size; - + schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; -- cgit v1.2.2 From 3e24aa58907c62bc79d1094e941a374568f62522 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: add an indirection ptr for chunk page map access Impact: allow sharing page map, no functional difference yet Make chunk->page access indirect by adding a pointer and renaming the actual array to page_ar. This will be used by future changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a84cf9977faf..5b47d9fe65f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,7 +80,8 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page *page[]; /* #cpus * UNIT_PAGES */ + struct page **page; /* points to page array */ + struct page *page_ar[]; /* #cpus * UNIT_PAGES */ }; static int pcpu_unit_pages __read_mostly; @@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); + schunk->page = schunk->page_ar; schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; -- cgit v1.2.2 From 
edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module percpu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. Signed-off-by: Tejun Heo --- mm/percpu.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 5b47d9fe65f5..ef8e169b7731 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* optional reserved chunk, only accessible for reserved allocations */ +static struct pcpu_chunk *pcpu_reserved_chunk; +/* offset limit of the reserved chunk */ +static int pcpu_reserved_chunk_limit; + /* * One mutex to rule them all.
* @@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size) * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is - * moved to the slot. + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (oslot != nslot) { + if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else @@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) struct rb_node *n, *parent; struct pcpu_chunk *chunk; + /* is it in the reserved chunk? */ + if (pcpu_reserved_chunk) { + void *start = pcpu_reserved_chunk->vm->addr; + + if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + } + + /* nah... search the regular ones */ n = *pcpu_chunk_rb_search(addr, &parent); if (!n) { /* no exactly matching chunk, the parent is the closest */ @@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) } /** - * __alloc_percpu - allocate percpu area + * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available * * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. @@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -void *__alloc_percpu(size_t size, size_t align) +static void *pcpu_alloc(size_t size, size_t align, bool reserved) { void *ptr = NULL; struct pcpu_chunk *chunk; @@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align) mutex_lock(&pcpu_mutex); - /* allocate area */ + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint) + goto out_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto out_unlock; + } + + /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) @@ -773,8 +800,41 @@ out_unlock: mutex_unlock(&pcpu_mutex); return ptr; } + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} EXPORT_SYMBOL_GPL(__alloc_percpu); +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { WARN_ON(chunk->immutable); @@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto @@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * @dyn_size, if non-negative, limits the number of bytes available * for dynamic allocation in the first chunk. Specifying non-negative * value make percpu leave alone the area beyond @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu); * @populate_pte_fn is used to populate the pagetable. 
NULL means the * caller already populated the pagetable. * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, + size_t static_size, size_t reserved_size, ssize_t unit_size, ssize_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; - static int smap[2]; - struct pcpu_chunk *schunk; + static int smap[2], dmap[2]; + struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + + BUG_ON(unit_size < static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0)); BUG_ON(unit_size & ~PAGE_MASK); } else { @@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); + PFN_UP(static_size + reserved_size)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size; + dyn_size = pcpu_unit_size - static_size - reserved_size; /* * Allocate chunk slots. The additional last slot is for @@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static chunk */ + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). 
+ */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; - schunk->free_size = dyn_size; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; + pcpu_reserved_chunk_limit = static_size + schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, else { /* * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked * immutable. 
*/ first_vm.addr = base_addr; schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; } /* assign pages */ @@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); + if (!dchunk) { + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); + } else { + pcpu_chunk_relocate(dchunk, -1); + pcpu_chunk_addr_insert(dchunk); + } /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); -- cgit v1.2.2 From 1880d93b80acc3171850e9df5048bcb26b75c2f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Impact: code reorganization for later changes With static map handling moved to pcpu_split_block(), pcpu_realloc() only clutters the code and it's also unsuitable for scheduled locking changes. Implement and use pcpu_mem_alloc/free() instead. Signed-off-by: Tejun Heo --- mm/percpu.c | 85 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ef8e169b7731..f1d0e905850c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, } /** - * pcpu_realloc - versatile realloc - * @p: the current pointer (can be NULL for new allocations) - * @size: the current size in bytes (can be 0 for new allocations) - * @new_size: the wanted new size in bytes (can be 0 for free) + * pcpu_mem_alloc - allocate memory + * @size: bytes to allocate * - * More robust realloc which can be used to allocate, resize or free a - * memory area of arbitrary size. If the needed size goes over - * PAGE_SIZE, kernel VM is used. + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, vmalloc() is used. 
The returned + * memory is always zeroed. * * RETURNS: - * The new pointer on success, NULL on failure. + * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_realloc(void *p, size_t size, size_t new_size) +static void *pcpu_mem_alloc(size_t size) { - void *new; - - if (new_size <= PAGE_SIZE) - new = kmalloc(new_size, GFP_KERNEL); - else - new = vmalloc(new_size); - if (new_size && !new) - return NULL; - - memcpy(new, p, min(size, new_size)); - if (new_size > size) - memset(new + size, 0, new_size - size); + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_KERNEL); + else { + void *ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; + } +} +/** + * pcpu_mem_free - free memory + * @ptr: memory to free + * @size: size of the area + * + * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + */ +static void pcpu_mem_free(void *ptr, size_t size) +{ if (size <= PAGE_SIZE) - kfree(p); + kfree(ptr); else - vfree(p); - - return new; + vfree(ptr); } /** @@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) if (chunk->map_alloc < target) { int new_alloc; int *new; + size_t size; new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < target) new_alloc *= 2; - if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) { - /* - * map_alloc smaller than the default size - * indicates that the chunk is one of the - * first chunks and still using static map. - * Allocate a dynamic one and copy. 
- */ - new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); - if (new) - memcpy(new, chunk->map, - chunk->map_alloc * sizeof(new[0])); - } else - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the + * chunk is one of the first chunks and still using + * static map. + */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + chunk->map_alloc = new_alloc; chunk->map = new; } @@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk) return; if (chunk->vm) free_vm_area(chunk->vm); - pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) if (!chunk) return NULL; - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; -- cgit v1.2.2 From 9f7dcf224bd09ec9ebcbfb383bf2c465e0e0b03d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: percpu: move chunk area map extension out of area allocation Impact: code reorganization for later changes Separate out chunk area map extension into a separate function - pcpu_extend_area_map() - and call it directly from pcpu_alloc() such that pcpu_alloc_area() is guaranteed to have enough area map slots on invocation. 
With this change, pcpu_alloc_area() does only area allocation and the only failure mode is when the chunk doesn't have enough room, so there's no need to distinguish it from memory allocation failures. Make it return -1 on such cases instead of hacky -ENOSPC. Signed-off-by: Tejun Heo --- mm/percpu.c | 108 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index f1d0e905850c..7d9bc35e8ed2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -306,6 +306,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) rb_insert_color(&new->rb_node, &pcpu_addr_root); } +/** + * pcpu_extend_area_map - extend area map for allocation + * @chunk: target chunk + * + * Extend area map of @chunk so that it can accomodate an allocation. + * A single allocation can split an area into three areas, so this + * function makes sure that @chunk->map has at least two extra slots. + * + * RETURNS: + * 0 if noop, 1 if successfully extended, -errno on failure. + */ +static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +{ + int new_alloc; + int *new; + size_t size; + + /* has enough? */ + if (chunk->map_alloc >= chunk->map_used + 2) + return 0; + + new_alloc = PCPU_DFL_MAP_ALLOC; + while (new_alloc < chunk->map_used + 2) + new_alloc *= 2; + + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is + * one of the first chunks and still using static map.
+ */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + + chunk->map_alloc = new_alloc; + chunk->map = new; + return 0; +} + /** * pcpu_split_block - split a map block * @chunk: chunk of interest @@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * RETURNS: - * 0 on success, -errno on failure. + * @chunk->map must have enough free slots to accomodate the split. */ -static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +static void pcpu_split_block(struct pcpu_chunk *chunk, int i, + int head, int tail) { int nr_extra = !!head + !!tail; - int target = chunk->map_used + nr_extra; - - /* reallocation required? */ - if (chunk->map_alloc < target) { - int new_alloc; - int *new; - size_t size; - - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < target) - new_alloc *= 2; - - new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) - return -ENOMEM; - - size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, size); - - /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the - * chunk is one of the first chunks and still using - * static map. - */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - pcpu_mem_free(chunk->map, size); - chunk->map_alloc = new_alloc; - chunk->map = new; - } + BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - /* insert a new subblock */ + /* insert new subblocks */ memmove(&chunk->map[i + nr_extra], &chunk->map[i], sizeof(chunk->map[0]) * (chunk->map_used - i)); chunk->map_used += nr_extra; @@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) chunk->map[i++] -= tail; chunk->map[i] = tail; } - return 0; } /** @@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) * Note that this function only allocates the offset. 
It doesn't * populate or map the area. * + * @chunk->map must have at least two free slots. + * * RETURNS: - * Allocated offset in @chunk on success, -errno on failure. + * Allocated offset in @chunk on success, -1 if no matching area is + * found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) { @@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) /* split if warranted */ if (head || tail) { - if (pcpu_split_block(chunk, i, head, tail)) - return -ENOMEM; + pcpu_split_block(chunk, i, head, tail); if (head) { i++; off += head; @@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* - * Tell the upper layer that this chunk has no area left. - * Note that this is not an error condition but a notification - * to upper layer that it needs to look at other chunks. - * -ENOSPC is chosen as it isn't used in memory subsystem and - * matches the meaning in a way. 
- */ - return -ENOSPC; + /* tell the upper layer that this chunk has no matching area */ + return -1; } /** @@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) + if (size > chunk->contig_hint || + pcpu_extend_area_map(chunk) < 0) goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) @@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; + if (pcpu_extend_area_map(chunk) < 0) + goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - if (off != -ENOSPC) - goto out_unlock; } } -- cgit v1.2.2 From a56dbddf06b653ef9c04ca3767f260fd31ccebab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:11 +0900 Subject: percpu: move fully free chunk reclamation into a work Impact: code reorganization for later changes Do fully free chunk reclamation using a work. This change is to prepare for locking changes. 
Signed-off-by: Tejun Heo --- mm/percpu.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 7d9bc35e8ed2..4c8a419119da 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ +/* reclaim work to release fully free chunks, scheduled from free path */ +static void pcpu_reclaim(struct work_struct *work); +static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); + static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ @@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align) return pcpu_alloc(size, align, true); } -static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +/** + * pcpu_reclaim - reclaim fully free chunks, workqueue function + * @work: unused + * + * Reclaim all fully free chunks except for the first one. 
+ */ +static void pcpu_reclaim(struct work_struct *work) { - WARN_ON(chunk->immutable); - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - list_del(&chunk->list); - rb_erase(&chunk->rb_node, &pcpu_addr_root); - free_pcpu_chunk(chunk); + LIST_HEAD(todo); + struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + struct pcpu_chunk *chunk, *next; + + mutex_lock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, head, list) { + WARN_ON(chunk->immutable); + + /* spare the first one */ + if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + continue; + + rb_erase(&chunk->rb_node, &pcpu_addr_root); + list_move(&chunk->list, &todo); + } + + mutex_unlock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, &todo, list) { + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + free_pcpu_chunk(chunk); + } } /** @@ -877,14 +906,13 @@ void free_percpu(void *ptr) pcpu_free_area(chunk, off); - /* the chunk became fully free, kill one if there are other free ones */ + /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { struct pcpu_chunk *pos; - list_for_each_entry(pos, - &pcpu_slot[pcpu_chunk_slot(chunk)], list) + list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_kill_chunk(pos); + schedule_work(&pcpu_reclaim_work); break; } } -- cgit v1.2.2 From ccea34b5d0fbab081496d1860f31acee99fa8a6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:13 +0900 Subject: percpu: finer grained locking to break deadlock and allow atomic free Impact: fix deadlock and allow atomic free Percpu allocation always uses GFP_KERNEL and whole alloc/free paths were protected by single mutex. All percpu allocations have been from GFP_KERNEL-safe context and the original allocator had this assumption too. 
However, by protecting both alloc and free paths with the same mutex, the new allocator creates free -> alloc -> GFP_KERNEL dependency which the original allocator didn't have. This can lead to deadlock if free is called from FS or IO paths. Also, in general, allocators are expected to allow free to be called from atomic context. This patch implements finer grained locking to break the deadlock and allow atomic free. For details, please read the "Synchronization rules" comment. While at it, also add CONTEXT: to function comments to describe which context they expect to be called from and what they do to it. This problem was reported by Thomas Gleixner and Peter Zijlstra. http://thread.gmane.org/gmane.linux.kernel/802384 Signed-off-by: Tejun Heo Reported-by: Thomas Gleixner Reported-by: Peter Zijlstra --- mm/percpu.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 124 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 4c8a419119da..bfe6a3afaf45 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * One mutex to rule them all. - * - * The following mutex is grabbed in the outermost public alloc/free - * interface functions and released only when the operation is - * complete. As such, every function in this file other than the - * outermost functions are called under pcpu_mutex. - * - * It can easily be switched to use spinlock such that only the area - * allocation and page population commit are protected with it doing - * actual [de]allocation without holding any lock. However, given - * what this allocator does, I think it's better to let them run - * sequentially. + * Synchronization rules. + * + * There are two locks - pcpu_alloc_mutex and pcpu_lock. 
The former + * protects allocation/reclaim paths, chunks and chunk->page arrays. + * The latter is a spinlock and protects the index data structures - + * chunk slots, rbtree, chunks and area maps in chunks. + * + * During allocation, pcpu_alloc_mutex is kept locked all the time and + * pcpu_lock is grabbed and released as necessary. All actual memory + * allocations are done using GFP_KERNEL with pcpu_lock released. + * + * Free path accesses and alters only the index data structures, so it + * can be safely called from atomic context. When memory needs to be + * returned to the system, free path schedules reclaim_work which + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be + * reclaimed, release both locks and frees the chunks. Note that it's + * necessary to grab both locks to remove a chunk from circulation as + * allocation path might be referencing the chunk with only + * pcpu_alloc_mutex locked. */ -static DEFINE_MUTEX(pcpu_mutex); +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ @@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, * kzalloc() is used; otherwise, vmalloc() is used. The returned * memory is always zeroed. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size) * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. + * + * CONTEXT: + * pcpu_lock. 
*/ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { @@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, * searchs for the chunk with the highest start address which isn't * beyond @addr. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * The address of the found chunk. */ @@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @new: chunk to insert * * Insert @new into address rb tree. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) { @@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * A single allocation can split an area into three areas, so this * function makes sure that @chunk->map has at least two extra slots. * + * CONTEXT: + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired + * if area map is extended. + * * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. */ @@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; + spin_unlock_irq(&pcpu_lock); + new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) new_alloc *= 2; new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) + if (!new) { + spin_lock_irq(&pcpu_lock); return -ENOMEM; + } + + /* + * Acquire pcpu_lock and switch to new area map. Only free + * could have happened inbetween, so map_used couldn't have + * grown. + */ + spin_lock_irq(&pcpu_lock); + BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); memcpy(new, chunk->map, size); @@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * is inserted after the target block. * * @chunk->map must have enough free slots to accomodate the split. + * + * CONTEXT: + * pcpu_lock. 
*/ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) @@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, * * @chunk->map must have at least two free slots. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * Allocated offset in @chunk on success, -1 if no matching area is * found. @@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap * the area. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { @@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, bool flush) @@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { @@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate percpu area of @size bytes aligned at @align. + * + * CONTEXT: + * Does GFP_KERNEL allocation. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { - void *ptr = NULL; struct pcpu_chunk *chunk; int slot, off; @@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + goto fail_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - goto out_unlock; + goto fail_unlock; } +restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; - if (pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + + switch (pcpu_extend_area_map(chunk)) { + case 0: + break; + case 1: + goto restart; /* pcpu_lock dropped, restart */ + default: + goto fail_unlock; + } + off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; @@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } /* hmmm... 
no space left, create a new chunk */ + spin_unlock_irq(&pcpu_lock); + chunk = alloc_pcpu_chunk(); if (!chunk) - goto out_unlock; + goto fail_unlock_mutex; + + spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); pcpu_chunk_addr_insert(chunk); - - off = pcpu_alloc_area(chunk, size, align); - if (off < 0) - goto out_unlock; + goto restart; area_found: + spin_unlock_irq(&pcpu_lock); + /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); - goto out_unlock; + goto fail_unlock; } - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); -out_unlock: - mutex_unlock(&pcpu_mutex); - return ptr; + mutex_unlock(&pcpu_alloc_mutex); + + return __addr_to_pcpu_ptr(chunk->vm->addr + off); + +fail_unlock: + spin_unlock_irq(&pcpu_lock); +fail_unlock_mutex: + mutex_unlock(&pcpu_alloc_mutex); + return NULL; } /** @@ -825,6 +897,9 @@ out_unlock: * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * percpu area if arch has set it up; otherwise, allocation is served * from the same dynamic area. Might sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align) * @work: unused * * Reclaim all fully free chunks except for the first one. + * + * CONTEXT: + * workqueue context. 
*/ static void pcpu_reclaim(struct work_struct *work) { @@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work) struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, head, list) { WARN_ON(chunk->immutable); @@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work) list_move(&chunk->list, &todo); } - mutex_unlock(&pcpu_mutex); + spin_unlock_irq(&pcpu_lock); + mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); @@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work) * free_percpu - free percpu area * @ptr: pointer to area to free * - * Free percpu area @ptr. Might sleep. + * Free percpu area @ptr. + * + * CONTEXT: + * Can be called from atomic context. */ void free_percpu(void *ptr) { void *addr = __pcpu_ptr_to_addr(ptr); struct pcpu_chunk *chunk; + unsigned long flags; int off; if (!ptr) return; - mutex_lock(&pcpu_mutex); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->vm->addr; @@ -917,7 +1004,7 @@ void free_percpu(void *ptr) } } - mutex_unlock(&pcpu_mutex); + spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.2 From e01009833e22dc87075d770554b34d797843ed23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: make x86 addr <-> pcpu ptr conversion macros generic Impact: generic addr <-> pcpu ptr conversion macros There's nothing arch specific about x86 __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr(). With proper __per_cpu_load and __per_cpu_start defined, they'll do the right thing regardless of actual layout. Move these macros from arch/x86/include/asm/percpu.h to mm/percpu.c and allow archs to override it as necessary. 
Signed-off-by: Tejun Heo --- mm/percpu.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index bfe6a3afaf45..c6f38a2aface 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -46,7 +46,8 @@ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate - * regular address to percpu pointer and back + * regular address to percpu pointer and back if they need to be + * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area @@ -67,11 +68,24 @@ #include #include +#include #include #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ +#ifndef __addr_to_pcpu_ptr +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#endif +#ifndef __pcpu_ptr_to_addr +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) +#endif + struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ struct rb_node rb_node; /* key is chunk->vm->addr */ -- cgit v1.2.2 From 6074d5b0a319fe8400ff079a3c289406ca024321 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: more flexibility for @dyn_size of pcpu_setup_first_chunk() Impact: cleanup, more flexibility for first chunk init Non-negative @dyn_size used to be allowed iff @unit_size wasn't auto. This restriction stemmed from implementation detail and made things a bit less intuitive. 
This patch allows @dyn_size to be specified regardless of @unit_size and swaps the positions of @dyn_size and @unit_size so that the parameter order makes more sense (static, reserved and dyn sizes followed by enclosing unit_size). While at it, add @unit_size >= PCPU_MIN_UNIT_SIZE sanity check. Signed-off-by: Tejun Heo --- mm/percpu.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index c6f38a2aface..2f94661d3e36 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1027,8 +1027,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -1053,14 +1053,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * + * @dyn_size, if non-negative, determines the number of bytes + * available for dynamic allocation in the first chunk. Specifying + * non-negative value makes percpu leave alone the area beyond + * @static_size + @reserved_size + @dyn_size. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + @dyn_size. - * - * @dyn_size, if non-negative, limits the number of bytes available - * for dynamic allocation in the first chunk. Specifying non-negative - * value make percpu leave alone the area beyond @static_size + - * @reserved_size + @dyn_size. + * @reserved_size + if non-negative, @dyn_size. 
* * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -1083,12 +1083,14 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; + size_t size_sum = static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; @@ -1099,20 +1101,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size < size_sum); BUG_ON(unit_size & ~PAGE_MASK); - } else { - BUG_ON(dyn_size >= 0); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + } else BUG_ON(base_addr); - } BUG_ON(base_addr && populate_pte_fn); if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size + reserved_size)); + PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; -- cgit v1.2.2 From 66c3a75772247c31feabefb724e082220a1ab060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: generalize embedding first chunk setup helper Impact: code reorganization Separate out embedding first chunk setup helper from x86 embedding first chunk allocator and put it in mm/percpu.c. This will be used by the default percpu first chunk allocator and possibly by other archs. 
Signed-off-by: Tejun Heo --- mm/percpu.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 2f94661d3e36..1aa5d8fbca12 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1238,3 +1238,89 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } + +/* + * Embedding first chunk setup helper. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * If this function is used to setup the first chunk, it is allocated + * as a contiguous area using bootmem allocator and used as-is without + * being mapped into vmalloc area. This enables the first chunk to + * piggy back on the linear physical mapping which often uses larger + * page size. + * + * When @dyn_size is positive, dynamic area might be larger than + * specified to fill page alignment. Also, when @dyn_size is auto, + * @dyn_size does not fill the whole first chunk but only what's + * necessary for page alignment after static and reserved areas. 
+ * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned to the bootmem allocator. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size) +{ + unsigned int cpu; + + /* determine parameters and allocate */ + pcpue_size = PFN_ALIGN(static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0)); + if (dyn_size != 0) + dyn_size = pcpue_size - static_size - reserved_size; + + if (unit_size >= 0) { + BUG_ON(unit_size < pcpue_size); + pcpue_unit_size = unit_size; + } else + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + + pcpue_ptr = __alloc_bootmem_nopanic( + num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) + return -ENOMEM; + + /* return the leftover and copy */ + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + reserved_size, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); +} -- cgit v1.2.2 From 60db56422043aaa455ac7f858ce23c273220f9d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Mar 2009 14:36:54 +0900 Subject: percpu: fix spurious alignment WARN in legacy SMP percpu allocator Impact: remove spurious WARN on legacy SMP percpu allocator Commit f2a8205c4ef1af917d175c36a4097ae5587791c8 incorrectly added too tight WARN_ON_ONCE() on alignments for UP and legacy SMP percpu allocator. Commit e317603694bfd17b28a40de9d65e1a4ec12f816e fixed it for UP but legacy SMP allocator was forgotten. 
Fix it. Signed-off-by: Tejun Heo Reported-by: Sachin P. Sant --- mm/allocpercpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 3653c570232b..1882923bc706 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -120,7 +120,7 @@ void *__alloc_percpu(size_t size, size_t align) * on it. Larger alignment should only be used for module * percpu sections on SMP for which this path isn't used. */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); + WARN_ON_ONCE(align > SMP_CACHE_BYTES); if (unlikely(!pdata)) return NULL; -- cgit v1.2.2 From 4bb9c5c02153dfc89a6c73a6f32091413805ad7d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 12 Mar 2009 17:45:27 -0700 Subject: VM, x86, PAT: Change is_linear_pfn_mapping to not use vm_pgoff Impact: fix false positive PAT warnings - also fix VirtualBox hang Use of vma->vm_pgoff to identify the pfnmaps that are fully mapped at mmap time is broken. vm_pgoff is set by generic mmap code even for cases where drivers are setting up the mappings at the fault time. The problem was originally reported here: http://marc.info/?l=linux-kernel&m=123383810628583&w=2 Change is_linear_pfn_mapping logic to overload VM_INSERTPAGE flag along with VM_PFNMAP to mean full PFNMAP setup at mmap time.
Problem also tracked at: http://bugzilla.kernel.org/show_bug.cgi?id=12800 Reported-by: Thomas Hellstrom Tested-by: Frans Pop Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha @intel.com> Cc: Nick Piggin Cc: "ebiederm@xmission.com" Cc: # only for 2.6.29.1, not .28 LKML-Reference: <20090313004527.GA7176@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..d7df5babcba9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - if (addr == vma->vm_start && end == vma->vm_end) + if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - else if (is_cow_mapping(vma->vm_flags)) + vma->vm_flags |= VM_PFNMAP_AT_MMAP; + } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; @@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); + vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; return -EINVAL; } -- cgit v1.2.2 From 895791dac6946d535991edd11341046f8e85ea77 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 13 Mar 2009 16:35:44 -0700 Subject: VM, x86, PAT: add a new vm flag to track full pfnmap at mmap Impact: cleanup Add a new vm flag VM_PFN_AT_MMAP to identify a PFNMAP that is fully mapped with remap_pfn_range. Patch removes the overloading of VM_INSERTPAGE from the earlier patch. 
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Nick Piggin LKML-Reference: <20090313233543.GA19909@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d7df5babcba9..2032ad2fc34b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1667,7 +1667,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, */ if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFNMAP_AT_MMAP; + vma->vm_flags |= VM_PFN_AT_MMAP; } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; @@ -1680,7 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; + vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } -- cgit v1.2.2 From 3297e760776af18a26bf30046cbaaae2e730c5c2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 4 Mar 2009 22:49:41 -0500 Subject: highmem: atomic highmem kmap page pinning Most ARM machines have a non IO coherent cache, meaning that the dma_map_*() set of functions must clean and/or invalidate the affected memory manually before DMA occurs. And because the majority of those machines have a VIVT cache, the cache maintenance operations must be performed using virtual addresses. When a highmem page is kunmap'd, its mapping (and cache) remains in place in case it is kmap'd again. However if dma_map_page() is then called with such a page, some cache maintenance on the remaining mapping must be performed. In that case, page_address(page) is non null and we can use that to synchronize the cache. It is unlikely but still possible for kmap() to race and recycle the virtual address obtained above, and use it for another page before some on-going cache invalidation loop in dma_map_page() is done. 
In that case, the new mapping could end up with dirty cache lines for another page, and the unsuspecting cache invalidation loop in dma_map_page() might simply discard those dirty cache lines resulting in data loss. For example, let's consider this sequence of events: - dma_map_page(..., DMA_FROM_DEVICE) is called on a highmem page. --> - vaddr = page_address(page) is non null. In this case it is likely that the page has valid cache lines associated with vaddr. Remember that the cache is VIVT. --> for (i = vaddr; i < vaddr + PAGE_SIZE; i += 32) invalidate_cache_line(i); *** preemption occurs in the middle of the loop above *** - kmap_high() is called for a different page. --> - last_pkmap_nr wraps to zero and flush_all_zero_pkmaps() is called. The pkmap_count value for the page passed to dma_map_page() above happens to be 1, so the page is unmapped. But prior to that, flush_cache_kmaps() cleared the cache for it. So far so good. - A fresh pkmap entry is assigned for this kmap request. The Murphy law says this pkmap entry will eventually happen to use the same vaddr as the one which used to belong to the other page being processed by dma_map_page() in the preempted thread above. - The kmap_high() caller start dirtying the cache using the just assigned virtual mapping for its page. *** the first thread is rescheduled *** - The for(...) loop is resumed, but now cached data belonging to a different physical page is being discarded ! And this is not only a preemption issue as ARM can be SMP as well, making the above scenario just as likely. Hence the need for some kind of pkmap page pinning which can be used in any context, primarily for the benefit of dma_map_page() on ARM. This provides the necessary interface to cope with the above issue if ARCH_NEEDS_KMAP_HIGH_GET is defined, otherwise the resulting code is unchanged. 
Signed-off-by: Nicolas Pitre Reviewed-by: MinChan Kim Acked-by: Andrew Morton --- mm/highmem.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index b36b83b920ff..910198037bf5 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -67,6 +67,25 @@ pte_t * pkmap_page_table; static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +/* + * Most architectures have no use for kmap_high_get(), so let's abstract + * the disabling of IRQ out of the locking in that case to save on a + * potential useless overhead. + */ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +#define lock_kmap() spin_lock_irq(&kmap_lock) +#define unlock_kmap() spin_unlock_irq(&kmap_lock) +#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) +#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) +#else +#define lock_kmap() spin_lock(&kmap_lock) +#define unlock_kmap() spin_unlock(&kmap_lock) +#define lock_kmap_any(flags) \ + do { spin_lock(&kmap_lock); (void)(flags); } while (0) +#define unlock_kmap_any(flags) \ + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) +#endif + static void flush_all_zero_pkmaps(void) { int i; @@ -113,9 +132,9 @@ static void flush_all_zero_pkmaps(void) */ void kmap_flush_unused(void) { - spin_lock(&kmap_lock); + lock_kmap(); flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); + unlock_kmap(); } static inline unsigned long map_new_virtual(struct page *page) @@ -145,10 +164,10 @@ start: __set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); + unlock_kmap(); schedule(); remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); + lock_kmap(); /* Somebody else might have mapped it while we slept */ if (page_address(page)) @@ -184,29 +203,59 @@ void *kmap_high(struct page *page) * For highmem pages, we can't trust "virtual" until * after we have the lock. 
*/ - spin_lock(&kmap_lock); + lock_kmap(); vaddr = (unsigned long)page_address(page); if (!vaddr) vaddr = map_new_virtual(page); pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - spin_unlock(&kmap_lock); + unlock_kmap(); return (void*) vaddr; } EXPORT_SYMBOL(kmap_high); +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +/** + * kmap_high_get - pin a highmem page into memory + * @page: &struct page to pin + * + * Returns the page's current virtual memory address, or NULL if no mapping + * exists. When and only when a non null address is returned then a + * matching call to kunmap_high() is necessary. + * + * This can be called from any context. + */ +void *kmap_high_get(struct page *page) +{ + unsigned long vaddr, flags; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + if (vaddr) { + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1); + pkmap_count[PKMAP_NR(vaddr)]++; + } + unlock_kmap_any(flags); + return (void*) vaddr; +} +#endif + /** * kunmap_high - map a highmem page into memory * @page: &struct page to unmap + * + * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called + * only from user context. 
*/ void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; + unsigned long flags; int need_wakeup; - spin_lock(&kmap_lock); + lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); nr = PKMAP_NR(vaddr); @@ -232,7 +281,7 @@ void kunmap_high(struct page *page) */ need_wakeup = waitqueue_active(&pkmap_map_wait); } - spin_unlock(&kmap_lock); + unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) -- cgit v1.2.2 From 1a00df4a2cc001dd9f45890e690548c24b2fa2d9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 7 Mar 2009 00:36:21 +0900 Subject: slub: use get_track() Use get_track() in set_track() Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Pekka Enberg --- mm/slub.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f21e25ad453b..e150b5c0424f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,14 +374,8 @@ static struct track *get_track(struct kmem_cache *s, void *object, static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); -- cgit v1.2.2 From 6fb8f424393025674fde7869b59f485d1e352182 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 16 Mar 2009 21:00:28 +1100 Subject: slob: fix lockup in slob_free() Don't hold SLOB lock when freeing the page. Reduces lock hold width. 
See the following thread for discussion of the bug: http://marc.info/?l=linux-kernel&m=123709983214143&w=2 Reported-by: Ingo Molnar Acked-by: Matt Mackall Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slob.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..f901653707a4 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -393,10 +393,11 @@ static void slob_free(void *block, int size) /* Go directly to page allocator. Do not pass slob allocator */ if (slob_page_free(sp)) clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); free_page((unsigned long)b); - goto out; + return; } if (!slob_page_free(sp)) { -- cgit v1.2.2 From 26160158d3d3df548f4ee046cc6147fe048cfa9c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2009 09:35:06 +0100 Subject: Move the default_backing_dev_info out of readahead.c and into backing-dev.c It really makes no sense to have it in readahead.c, so move it where it belongs. 
Signed-off-by: Jens Axboe --- mm/backing-dev.c | 26 +++++++++++++++++++++++++- mm/readahead.c | 25 ------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8e8587444132..be68c956a660 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -2,11 +2,24 @@ #include #include #include +#include #include #include #include #include +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; @@ -166,9 +179,20 @@ static __init int bdi_class_init(void) bdi_debug_init(); return 0; } - postcore_initcall(bdi_class_init); +static int __init default_bdi_init(void) +{ + int err; + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; +} +subsys_initcall(default_bdi_init); + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { diff --git a/mm/readahead.c b/mm/readahead.c index bec83c15a78f..9ce303d4b810 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,19 +17,6 @@ #include #include -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - -struct backing_dev_info default_backing_dev_info = { - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. 
@@ -233,18 +220,6 @@ unsigned long max_sane_readahead(unsigned long nr) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } -static int __init readahead_init(void) -{ - int err; - - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); - - return err; -} -subsys_initcall(readahead_init); - /* * Submit IO for the read-ahead request in file_ra_state. */ -- cgit v1.2.2 From 1b5e62b42b55c509eea04c3c0f25e42c8b35b564 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 23 Mar 2009 08:57:38 +0800 Subject: writeback: double the dirty thresholds Enlarge default dirty ratios from 5/10 to 10/20. This fixes [Bug #12809] iozone regression with 2.6.29-rc6. The iozone benchmarks are performed on a 1200M file, with 8GB ram. iozone -i 0 -i 1 -i 2 -i 3 -i 4 -r 4k -s 64k -s 512m -s 1200m -b tmp.xls iozone -B -r 4k -s 64k -s 512m -s 1200m -b tmp.xls The performance regression is triggered by commit 1cf6e7d83bf3(mm: task dirty accounting fix), which makes more correct/thorough dirty accounting. The default 5/10 dirty ratios were picked (a) with the old dirty logic and (b) largely at random and (c) designed to be aggressive. In particular, that (a) means that having fixed some of the dirty accounting, maybe the real bug is now that it was always too aggressive, just hidden by an accounting issue. The enlarged 10/20 dirty ratios are just about enough to fix the regression. [ We will have to look at how this affects the old fsync() latency issue, but that probably will need independent work. 
- Linus ] Cc: Nick Piggin Cc: Peter Zijlstra Reported-by: "Lin, Ming M" Tested-by: "Lin, Ming M" Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 74dc57c74349..40ca7cdb653e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 5; +int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 10; +int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of -- cgit v1.2.2 From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:10 -0600 Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL Impact: cleanup (Thanks to Al Viro for reminding me of this, via Ingo) CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so: #define CPU_MASK_ALL (cpumask_t) { { ... } } Taking the address of such a temporary is questionable at best, unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added CPU_MASK_ALL_PTR: #define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) Which formalizes this practice. One day gcc could bite us over this usage (though we seem to have gotten away with it so far). So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR with the modern "cpu_all_mask" (a real const struct cpumask *). 
Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Reported-by: Al Viro Cc: Mike Travis --- mm/pdflush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 15de509b68fd..118905e3d788 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -191,7 +191,7 @@ static int pdflush(void *dummy) /* * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. + * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. * Our needs are more modest - cut back to our cpusets cpus_allowed. * This is needed as pdflush's are dynamically created and destroyed. * The boottime pdflush's are easily placed w/o these 2 lines. -- cgit v1.2.2 From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:15 -0600 Subject: cpumask: use new cpumask_ functions in core code. Impact: cleanup Time to clean up remaining laggards using the old cpu_ functions. 
Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Trond.Myklebust@netapp.com --- mm/allocpercpu.c | 2 +- mm/vmstat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 1882923bc706..139d5b7b6621 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -143,7 +143,7 @@ void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; - __percpu_depopulate_mask(__pdata, &cpu_possible_map); + __percpu_depopulate_mask(__pdata, cpu_possible_mask); kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/vmstat.c b/mm/vmstat.c index 91149746bb8d..8cd81ea1ddc1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask_nr(cpu, *cpumask) { + for_each_cpu(cpu, cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) -- cgit v1.2.2 From e713a21d8251a4c91772f592af46407dfb0b2e4f Mon Sep 17 00:00:00 2001 From: Alexey Zaytsev Date: Sat, 10 Jan 2009 02:47:57 +0300 Subject: trivial: Fix dubious bitwise 'or' usage spotted by sparse. It doesn't change the semantics, but it looks like the logical 'or' was meant to be used here. 
Signed-off-by: Alexey Zaytsev Signed-off-by: Jiri Kosina --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c44ed49ca93..daa36f103e77 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -331,7 +331,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) | (p->first_page != page))) { + if (unlikely(!PageTail(p) || (p->first_page != page))) { bad_page(page); bad++; } -- cgit v1.2.2 From 19cefdffbfe0f7e280f21e80875937e8700e99e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Mar 2009 06:03:11 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS), fix SLOB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix fix typo in mm/slob.c: mm/slob.c:469: error: ‘flags’ undeclared (first use in this function) mm/slob.c:469: error: (Each undeclared identifier is reported only once mm/slob.c:469: error: for each function it appears in.) Cc: Nick Piggin Cc: Peter Zijlstra LKML-Reference: <20090128135457.350751756@chello.nl> Signed-off-by: Ingo Molnar --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 1264799df5d1..4b1c0c1d63cb 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -464,7 +464,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - lockdep_trace_alloc(flags); + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { if (!size) -- cgit v1.2.2 From ef161a9863b045909142daea9490b067997f3dc5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:19:25 -0700 Subject: mm: mminit_validate_memmodel_limits(): remove redundant test In case if start_pfn overlap the upper bound no need to test end_pfn again since we have it already trimmed. 
Signed-off-by: Cyrill Gorcunov Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 083f5b63e7a8..da432d9f0ae8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, WARN_ON_ONCE(1); *start_pfn = max_sparsemem_pfn; *end_pfn = max_sparsemem_pfn; - } - - if (*end_pfn > max_sparsemem_pfn) { + } else if (*end_pfn > max_sparsemem_pfn) { mminit_dprintk(MMINIT_WARNING, "pfnvalidation", "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", *start_pfn, *end_pfn, max_sparsemem_pfn); -- cgit v1.2.2 From d086817dc0d42f1be8db4138233d33e1dd16a956 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 31 Mar 2009 15:19:26 -0700 Subject: vmap: remove needless lock and list in vmap vmap's dirty_list is unused. It is meant for optimizing flushing, but Nick hasn't written that code yet, so we don't need it until such time as it is needed. This patch removes vmap_block's dirty_list and the code related to it.
Signed-off-by: MinChan Kim Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af58324c361a..fab19876b4d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,10 +671,7 @@ struct vmap_block { DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); union { - struct { - struct list_head free_list; - struct list_head dirty_list; - }; + struct list_head free_list; struct rcu_head rcu_head; }; }; @@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); - INIT_LIST_HEAD(&vb->dirty_list); vb_idx = addr_to_vb_idx(va->va_start); spin_lock(&vmap_block_tree_lock); @@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - spin_lock(&vb->vbq->lock); - if (!list_empty(&vb->free_list)) - list_del(&vb->free_list); - if (!list_empty(&vb->dirty_list)) - list_del(&vb->dirty_list); - spin_unlock(&vb->vbq->lock); + BUG_ON(!list_empty(&vb->free_list)); vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); @@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size) spin_lock(&vb->lock); bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); - if (!vb->dirty) { - spin_lock(&vb->vbq->lock); - list_add(&vb->dirty_list, &vb->vbq->dirty); - spin_unlock(&vb->vbq->lock); - } + vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free || !list_empty(&vb->free_list)); -- cgit v1.2.2 From a12888f772dab4bf5e6f73668dc4f5f6026a7014 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:19:27 -0700 Subject: oom_kill: don't call for int_sqrt(0) There is no need to call for int_sqrt if argument is 
0. Signed-off-by: Cyrill Gorcunov Cc: Pekka Enberg Cc: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 40ba05061a4f..d3b9bac085b5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); unsigned long badness(struct task_struct *p, unsigned long uptime) { - unsigned long points, cpu_time, run_time, s; + unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; @@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) else run_time = 0; - s = int_sqrt(cpu_time); - if (s) - points /= s; - s = int_sqrt(int_sqrt(run_time)); - if (s) - points /= s; + if (cpu_time) + points /= int_sqrt(cpu_time); + if (run_time) + points /= int_sqrt(int_sqrt(run_time)); /* * Niced processes are most likely less important, so double -- cgit v1.2.2 From a6dc60f8975ad96d162915e07703a4439c80dcf0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:30 -0700 Subject: vmscan: rename sc.may_swap to may_unmap sc.may_swap does not only influence reclaiming of anon pages but pages mapped into pagetables in general, which also includes mapped file pages. In shrink_page_list(): if (!sc->may_swap && page_mapped(page)) goto keep_locked; For anon pages, this makes sense as they are always mapped and reclaiming them always requires swapping. But mapped file pages are skipped here as well and it has nothing to do with swapping. The real effect of the knob is whether mapped pages are unmapped and reclaimed or not. Rename it to `may_unmap' to have its name match its actual meaning more precisely. 
Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Reviewed-by: KOSAKI Motohiro Cc: Lee Schermerhorn Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 479e46719394..1bca60f0c527 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -60,8 +60,8 @@ struct scan_control { int may_writepage; - /* Can pages be swapped as part of reclaim? */ - int may_swap; + /* Can mapped pages be reclaimed? */ + int may_unmap; /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. @@ -606,7 +606,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(!page_evictable(page, NULL))) goto cull_mlocked; - if (!sc->may_swap && page_mapped(page)) + if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -1694,7 +1694,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, + .may_unmap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1713,7 +1713,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct scan_control sc = { .may_writepage = !laptop_mode, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, @@ -1723,7 +1723,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, struct zonelist *zonelist; if (noswap) - sc.may_swap = 0; + sc.may_unmap = 0; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1762,7 +1762,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { 
.gfp_mask = GFP_KERNEL, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, @@ -2110,7 +2110,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 0, + .may_unmap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, .isolate_pages = isolate_pages_global, @@ -2147,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) - sc.may_swap = 1; + sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; @@ -2290,7 +2290,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, -- cgit v1.2.2 From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:31 -0700 Subject: mm: introduce for_each_populated_zone() macro Impact: cleanup In almost all cases, for_each_zone() is used together with populated_zone(), because most functions don't need memoryless node information. Therefore, for_each_populated_zone() can help simplify the code. This patch has no functional change.
[akpm@linux-foundation.org: small cleanup] Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++--------------------- mm/vmscan.c | 4 +--- mm/vmstat.c | 11 ++--------- 3 files changed, 8 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3803ea8c27d..cbd532161f68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -1879,10 +1876,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1922,12 +1916,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1967,12 +1958,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bca60f0c527..301f057fd115 100644 --- 
a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, struct zone *zone; unsigned long ret = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { enum lru_list l; - if (!populated_zone(zone)) - continue; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8cd81ea1ddc1..9826766f1274 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void) int cpu; int threshold; - for_each_zone(zone) { - - if (!zone->present_pages) - continue; - + for_each_populated_zone(zone) { threshold = calculate_threshold(zone); for_each_online_cpu(cpu) @@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu) int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *p; - if (!populated_zone(zone)) - continue; - p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) -- cgit v1.2.2 From 0a0dd05dd7e1a800241888cbf515bf8d3dc2e59c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:33 -0700 Subject: mm: don't call mark_page_accessed() in do_swap_page() commit bf3f3bc5e734706730c12a323f9b2068052aa1f0 (mm: don't mark_page_accessed in fault path) only remove the mark_page_accessed() in filemap_fault(). Therefore, swap-backed pages and file-backed pages have inconsistent behavior. mark_page_accessed() should be removed from do_swap_page(). 
Signed-off-by: KOSAKI Motohiro Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2032ad2fc34b..0017111214c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2435,8 +2435,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } - mark_page_accessed(page); - lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); -- cgit v1.2.2 From d979677c4c02f0a72db5a03ecd8184bd9d6695c8 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 31 Mar 2009 15:19:34 -0700 Subject: mm: shrink_all_memory(): use sc.nr_reclaimed Commit a79311c14eae4bb946a97af25f3e1b17d625985d "vmscan: bail out of direct reclaim after swap_cluster_max pages" moved the nr_reclaimed counter into the scan control to accumulate the number of all reclaimed pages in a reclaim invocation. shrink_all_memory() can use the same mechanism. It increases code consistency and readability. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: MinChan Kim Signed-off-by: KOSAKI Motohiro Signed-off-by: Johannes Weiner Cc: "Rafael J. Wysocki" Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 301f057fd115..b15dcbb9e174 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2050,16 +2050,15 @@ unsigned long global_lru_pages(void) #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages + * from LRU lists system-wide, for given pass and priority.
* * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, +static void shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long ret = 0; + unsigned long nr_reclaimed = 0; for_each_populated_zone(zone) { enum lru_list l; @@ -2082,14 +2081,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, zone->lru[l].nr_scan = 0; nr_to_scan = min(nr_pages, lru_pages); - ret += shrink_list(l, nr_to_scan, zone, + nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); - if (ret >= nr_pages) - return ret; + if (nr_reclaimed >= nr_pages) { + sc->nr_reclaimed = nr_reclaimed; + return; + } } } } - return ret; + sc->nr_reclaimed = nr_reclaimed; } /* @@ -2103,7 +2104,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, unsigned long shrink_all_memory(unsigned long nr_pages) { unsigned long lru_pages, nr_slab; - unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { @@ -2125,8 +2125,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!reclaim_state.reclaimed_slab) break; - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; @@ -2148,18 +2148,18 @@ unsigned long shrink_all_memory(unsigned long nr_pages) sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) + shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (sc.nr_reclaimed >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, global_lru_pages()); - ret += 
reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) @@ -2168,21 +2168,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages) } /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches + * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be + * something in slab caches */ - if (!ret) { + if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + } while (sc.nr_reclaimed < nr_pages && + reclaim_state.reclaimed_slab > 0); } + out: current->reclaim_state = NULL; - return ret; + return sc.nr_reclaimed; } #endif -- cgit v1.2.2 From 9786bf841da57fac3457a1dac41acb4c1f2eced6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:35 -0700 Subject: vmscan: clip swap_cluster_max in shrink_all_memory() shrink_inactive_list() scans in sc->swap_cluster_max chunks until it hits the scan limit it was passed. shrink_inactive_list() { do { isolate_pages(swap_cluster_max) shrink_page_list() } while (nr_scanned < max_scan); } This assumes that swap_cluster_max is not bigger than the scan limit because the latter is checked only after at least one iteration. In shrink_all_memory() sc->swap_cluster_max is initialized to the overall reclaim goal in the beginning but not decreased while reclaim is making progress which leads to subsequent calls to shrink_inactive_list() reclaiming way too much in the one iteration that is done unconditionally. Set sc->swap_cluster_max always to the proper goal before doing shrink_all_zones() shrink_list() shrink_inactive_list(). 
While the current shrink_all_memory() happily reclaims more than actually requested, this patch fixes it to never exceed the goal: unpatched wanted=10000 reclaimed=13356 wanted=10000 reclaimed=19711 wanted=10000 reclaimed=10289 wanted=10000 reclaimed=17306 wanted=10000 reclaimed=10700 wanted=10000 reclaimed=10004 wanted=10000 reclaimed=13301 wanted=10000 reclaimed=10976 wanted=10000 reclaimed=10605 wanted=10000 reclaimed=10088 wanted=10000 reclaimed=15000 patched wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9599 wanted=10000 reclaimed=8476 wanted=10000 reclaimed=8326 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9919 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9624 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=8500 reclaimed=8092 wanted=316 reclaimed=316 Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Acked-by: Nigel Cunningham Acked-by: "Rafael J. 
Wysocki" Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b15dcbb9e174..9578063cd943 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2109,7 +2109,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 0, - .swap_cluster_max = nr_pages, .may_writepage = 1, .isolate_pages = isolate_pages_global, }; @@ -2151,6 +2150,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; + sc.swap_cluster_max = nr_to_scan; shrink_all_zones(nr_to_scan, prio, pass, &sc); if (sc.nr_reclaimed >= nr_pages) goto out; -- cgit v1.2.2 From bd775c42ea5f7c766d03a287083837cf05e7e738 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:37 -0700 Subject: mm: add comment why mark_page_accessed() would be better than pte_mkyoung() in follow_page() At first look, mark_page_accessed() in follow_page() seems a bit strange. It seems pte_mkyoung() would be better consistent with other kernel code. However, it is intentional. The commit log said: ------------------------------------------------ commit 9e45f61d69be9024a2e6bef3831fb04d90fac7a8 Author: akpm Date: Fri Aug 15 07:24:59 2003 +0000 [PATCH] Use mark_page_accessed() in follow_page() Touching a page via follow_page() counts as a reference so we should be either setting the referenced bit in the pte or running mark_page_accessed(). Altering the pte is tricky because we haven't implemented an atomic pte_mkyoung(). And mark_page_accessed() is better anyway because it has more aging state: it can move the page onto the active list. BKrev: 3f3c8acbplT8FbwBVGtth7QmnqWkIw ------------------------------------------------ The atomic issue is still true nowadays. 
Adding a comment helps to understand the intention of the code. [akpm@linux-foundation.org: clarify text] Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0017111214c5..5b4ad5e4f98d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ mark_page_accessed(page); } unlock: -- cgit v1.2.2 From bd2f6199cf9af472aeefa1b642c9f504f19e6008 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:38 -0700 Subject: vmscan: respect higher order in zone_reclaim() During page allocation, there are two stages of direct reclaim that are applied to each zone in the preferred list. The first stage using zone_reclaim() reclaims unmapped file backed pages and slab pages if over defined limits as these are cheaper to reclaim. The caller specifies the order of the target allocation but the scan control is not being correctly initialised. The impact is that the correct number of pages are being reclaimed but that lumpy reclaim is not being applied. This increases the chances that a full direct reclaim via try_to_free_pages() is required. This patch initialises the order field of the scan control as requested by the caller.
[mel@csn.ul.ie: rewrote changelog] Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Cc: Rik van Riel Cc: Andy Whitcroft Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9578063cd943..51f2df04d7cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2295,6 +2295,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .order = order, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; -- cgit v1.2.2 From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. 
@@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } -- cgit v1.2.2 From 2443462b0a04ef0f82ad48f4fd0ef4ac5b24c4b7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:23:12 -0700 Subject: mm: move pagevec stripping to save unlock-relock In shrink_active_list() after the deactivation loop, we strip buffer heads from the potentially remaining pages in the pagevec. Currently, this drops the zone's lru lock for stripping, only to reacquire it again afterwards to update statistics. It is not necessary to strip the pages before updating the stats, so move the whole thing out of the protected region and save the extra locking. 
Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 51f2df04d7cf..988aef933016 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1298,14 +1298,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); - } __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); if (vm_swap_full()) pagevec_swap_free(&pvec); -- cgit v1.2.2 From ad1c3544d0a85da7738ce8cff6f8a148da57935c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:23:13 -0700 Subject: mm: don't free swap slots on page deactivation The pagevec_swap_free() at the end of shrink_active_list() was introduced in 68a22394 "vmscan: free swap space on swap-in/activation" when shrink_active_list() was still rotating referenced active pages. In 7e9cd48 "vmscan: fix pagecache reclaim referenced bit check" this was changed, the rotating removed but the pagevec_swap_free() after the rotation loop was forgotten, applying now to the pagevec of the deactivation loop instead. Now swap space is freed for deactivated pages. And only for those that happen to be on the pagevec after the deactivation loop. Complete 7e9cd48 and remove the rest of the swap freeing. 
Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 988aef933016..e70fae31e968 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1303,9 +1303,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); - if (vm_swap_full()) - pagevec_swap_free(&pvec); - pagevec_release(&pvec); } -- cgit v1.2.2 From d1d7487173eab8352125cf6cc271940f24254bd4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:23:14 -0700 Subject: mm: remove pagevec_swap_free() pagevec_swap_free() is now unused. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 8adb9feb61e1..6e83084c1f6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -456,29 +456,6 @@ void pagevec_strip(struct pagevec *pvec) } } -/** - * pagevec_swap_free - try to free swap space from the pages in a pagevec - * @pvec: pagevec with swapcache pages to free the swap space of - * - * The caller needs to hold an extra reference to each page and - * not hold the page lock on the pages. This function uses a - * trylock on the page lock so it may not always free the swap - * space associated with a page. 
- */ -void pagevec_swap_free(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (PageSwapCache(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); - } - } -} - /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed -- cgit v1.2.2 From e2f17d9459aeccf4e013e31cbd741d6b1858eec4 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:23:15 -0700 Subject: hugetlb: chg cannot become less than 0 chg is unsigned, so it cannot be less than 0. Also, since region_chg returns long, let vma_needs_reservation() forward this to alloc_huge_page(). Store it as long as well. all callers cast it to long anyway. Signed-off-by: Roel Kluin Cc: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 107da3d809a8..28c655ba9353 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. 
*/ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; @@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and -- cgit v1.2.2 From 610a77e04a8d9fe8764dc484e2182fa251ce1cc2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Mar 2009 15:23:16 -0700 Subject: memdup_user(): introduce I notice there are many places doing copy_from_user() which follows kmalloc(): dst = kmalloc(len, GFP_KERNEL); if (!dst) return -ENOMEM; if (copy_from_user(dst, src, len)) { kfree(dst); return -EFAULT; } memdup_user() is a wrapper for the above code. With this new function, we don't have to write 'len' twice, which can lead to typos/mistakes. It also produces smaller code and kernel text. A quick grep shows 250+ places where memdup_user() *may* be used. I'll prepare a patchset to do this conversion.
Signed-off-by: Li Zefan Cc: KOSAKI Motohiro Cc: Americo Wang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 37eaccdf3054..7c122e49f769 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,6 +69,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. -- cgit v1.2.2 From 6a11f75b6a17b5d9ac5025f8d048382fd1f47377 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:17 -0700 Subject: generic debug pagealloc CONFIG_DEBUG_PAGEALLOC is now supported by x86, powerpc, sparc64, and s390. This patch implements it for the rest of the architectures by filling the pages with poison byte patterns after free_pages() and verifying the poison patterns before alloc_pages(). This generic one cannot detect invalid page accesses immediately but invalid read access may cause invalid dereference by poisoned memory and invalid write access can be detected after a long delay. 
Signed-off-by: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 17 +++++++ mm/Makefile | 1 + mm/debug-pagealloc.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 mm/Kconfig.debug create mode 100644 mm/debug-pagealloc.c (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 000000000000..c8d62d49a44e --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,17 @@ +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. 
diff --git a/mm/Makefile b/mm/Makefile index 818569b68f46..ec73c68b6015 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_FAILSLAB) += failslab.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 000000000000..a1e3324de2b5 --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-debug-flags.h> +#include <linux/poison.h> + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection.
+ */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). 
+ * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} -- cgit v1.2.2 From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:23:18 -0700 Subject: mm: fix proc_dointvec_userhz_jiffies "breakage" Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838 On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange way from the user's point of view: # echo 500 >/proc/sys/vm/dirty_writeback_centisecs # cat /proc/sys/vm/dirty_writeback_centisecs 499 So, we have 5000 jiffies converted to only 499 clock ticks and reported back. TICK_NSEC = 999848 ACTHZ = 256039 Keeping in-kernel variable in units passed from userspace will fix issue of course, but this probably won't be right for every sysctl. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6aa92b03c747..30351f0063ac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -92,14 +92,14 @@ int vm_dirty_ratio = 20; unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. 
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - dirty_expire_interval; + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; + next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, jiffies + + msecs_to_jiffies(dirty_writeback_interval * 10)); else del_timer(&wb_timer); return 0; @@ -905,7 +906,8 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, + jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); -- cgit v1.2.2 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. 
Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.2 From 
f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: mm: introduce debug_kmap_atomic x86 has debug_kmap_atomic_prot() which is an error-checking function for kmap_atomic. It is useful for the other architectures, although it needs CONFIG_TRACE_IRQFLAGS_SUPPORT. This patch exposes it to the other architectures. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 910198037bf5..68eb1d9b63fa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,3 +422,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif -- cgit v1.2.2 From 33925b25d2c00a29664f1994ab350a9bff70f7a2
Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:26 -0700 Subject: nommu: there is no mlock() for NOMMU, so don't provide the bits The mlock() facility does not exist for NOMMU since all mappings are effectively locked anyway, so we don't make the bits available when they're not useful. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 8 ++++++++ mm/internal.h | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf2..8c895973dfba 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,5 +214,13 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. +config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2a..987bb03fbdd8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif #ifdef CONFIG_UNEVICTABLE_LRU /* @@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. 
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page) } } -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within -- cgit v1.2.2 From 71aa653c6bfa6743d838342105ebc067145394e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:28 -0700 Subject: nommu: make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n Make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n. There's no logical reason it shouldn't be available, and it can be used for ramfs. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 8c895973dfba..b53427ad30a3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -206,7 +206,6 @@ config VIRT_TO_BUS config UNEVICTABLE_LRU bool "Add LRU list to track non-evictable pages" default y - depends on MMU help Keeps unevictable pages off of the active and inactive pageout lists, so kswapd will not waste CPU time or have its balancing -- cgit v1.2.2 From 88c3bd707c2552bcef93cc3724647903aece159d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Mar 2009 15:23:29 -0700 Subject: vmscan: print shrink_slab symbol name on negative shrinker objects When a shrinker has a negative number of objects to delete, the symbol name of the shrinker should be printed, not shrink_slab. 
This also makes the error message slightly more informative. Cc: Ingo Molnar Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e70fae31e968..f4619c6cd59e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -214,8 +214,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, do_div(delta, lru_pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) { - printk(KERN_ERR "%s: nr=%ld\n", - __func__, shrinker->nr); + printk(KERN_ERR "shrink_slab: %pF negative objects to " + "delete nr=%ld\n", + shrinker->shrink, shrinker->nr); shrinker->nr = max_pass; } -- cgit v1.2.2 From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Mar 2009 15:23:31 -0700 Subject: vmscan: fix it to take care of nodemask try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX pages when watermarks are low. The caller to alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to be used but this is not passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages that are unusable by the caller and, in the worst case, lead to allocation failure as progress is not being made where it is needed. This patch passes the nodemask used for alloc_pages_nodemask() to try_to_free_pages().
Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- mm/vmscan.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbd532161f68..0284e528748d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1582,7 +1582,8 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmscan.c b/mm/vmscan.c index f4619c6cd59e..06e72693b458 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,6 +78,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. 
+ */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1683,7 +1690,7 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; -- cgit v1.2.2 From 9fab5619bdd7f84cdd22cc760778f759f9819a33 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Mar 2009 15:23:33 -0700 Subject: shmem: writepage directly to swap Synopsis: if shmem_writepage calls swap_writepage directly, most shmem swap loads benefit, and a catastrophic interaction between SLUB and some flash storage is avoided. shmem_writepage() has always been peculiar in making no attempt to write: it has just transferred a shmem page from file cache to swap cache, then let that page make its way around the LRU again before being written and freed. 
The idea was that people use tmpfs because they want those pages to stay in RAM; so although we give it an overflow to swap, we should resist writing too soon, giving those pages a second chance before they can be reclaimed. That was always questionable, and I've toyed with this patch for years; but never had a clear justification to depart from the original design. It became more questionable in 2.6.28, when the split LRU patches classed shmem and tmpfs pages as SwapBacked rather than as file_cache: that in itself gives them more resistance to reclaim than normal file pages. I prepared this patch for 2.6.29, but the merge window arrived before I'd completed gathering statistics to justify sending it in. Then while comparing SLQB against SLUB, running SLUB on a laptop I'd habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping tests five times slower than SLAB or SLQB - other machines slower too, but nowhere near so bad. Simpler "cp -a" swapping tests showed the same. slub_max_order=0 brings sanity to all, but heavy swapping is too far from normal to justify such a tuning. The crucial factor on that laptop turns out to be that I'm using an SD card for swap. What happens is this: By default, SLUB uses order-2 pages for shmem_inode_cache (and many other fs inodes), so creating tmpfs files under memory pressure brings lumpy reclaim into play. One subpage of the order is chosen from the bottom of the LRU as usual, then the other three picked out from their random positions on the LRUs. In a tmpfs load, many of these pages will be ones which already passed through shmem_writepage, so already have swap allocated. And though their offsets on swap were probably allocated sequentially, now that the pages are picked off at random, their swap offsets are scattered. But the flash storage on the SD card is very sensitive to having its writes merged: once swap is written at scattered offsets, performance falls apart. 
Rotating disk seeks increase too, but less disastrously. So: stop giving shmem/tmpfs pages a second pass around the LRU, write them out to swap as soon as their swap has been allocated. It's surely possible to devise an artificial load which runs faster the old way, one whose sizing is such that the tmpfs pages on their second pass are the ones that are wanted again, and other pages not. But I've not yet found such a load: on all machines, under the loads I've tried, immediate swap_writepage speeds up shmem swapping: especially when using the SLUB allocator (and more effectively than slub_max_order=0), but also with the others; and it also reduces the variance between runs. How much faster varies widely: a factor of five is rare, 5% is common. One load which might have suffered: imagine a swapping shmem load in a limited mem_cgroup on a machine with plenty of memory. Before 2.6.29 the swapcache was not charged, and such a load would have run quickest with the shmem swapcache never written to swap. But now swapcache is charged, so even this load benefits from shmem_writepage directly to swap. Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h: it's silly because that will never get called; but refactoring shmem.c sensibly according to CONFIG_SWAP will be a separate task. 
Signed-off-by: Hugh Dickins Acked-by: Pekka Enberg Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 7ec78e24a30d..d94d2e9146bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swap_duplicate(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ - set_page_dirty(page); - unlock_page(page); + swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ -- cgit v1.2.2 From ee3b4290aec03022cfb67c9adba9f1b3215245f0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 2 Apr 2009 16:56:30 -0700 Subject: generic debug pagealloc: build fix This fixes a build failure with generic debug pagealloc: mm/debug-pagealloc.c: In function 'set_page_poison': mm/debug-pagealloc.c:8: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'clear_page_poison': mm/debug-pagealloc.c:13: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'page_poison': mm/debug-pagealloc.c:18: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: At top level: mm/debug-pagealloc.c:120: error: redefinition of 'kernel_map_pages' include/linux/mm.h:1278: error: previous definition of 'kernel_map_pages' was here mm/debug-pagealloc.c: In function 'kernel_map_pages': mm/debug-pagealloc.c:122: error: 'debug_pagealloc_enabled' undeclared (first use in this function) by fixing - debug_flags should be in struct page - define DEBUG_PAGEALLOC config option for all architectures Signed-off-by: Akinobu Mita Reported-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git 
a/mm/Kconfig.debug b/mm/Kconfig.debug index c8d62d49a44e..bb01e298f260 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,3 +1,12 @@ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION || !PPC && !SPARC + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + config WANT_PAGE_DEBUG_FLAGS bool -- cgit v1.2.2 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. 
Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a686..4a3841186c11 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..72eda4aee2cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < 
last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap 
of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.2 From 98f4ebb290a7dca8c48f27ec1d2cab8fa7982dad Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 2 Apr 2009 16:56:39 -0700 Subject: mm: align vmstat_work's timer Even though vmstat_work is marked deferrable, there are still benefits to aligning it. For certain applications we want to keep OS jitter as low as possible and aligning timers and work so they occur together can reduce their overall impact. Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 9826766f1274..66f6130976cb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -891,7 +891,7 @@ static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(smp_processor_id()); schedule_delayed_work(&__get_cpu_var(vmstat_work), - sysctl_stat_interval); + round_jiffies_relative(sysctl_stat_interval)); } static void __cpuinit start_cpu_timer(int cpu) @@ -899,7 +899,8 @@ static void __cpuinit start_cpu_timer(int cpu) struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); - schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); + schedule_delayed_work_on(cpu, vmstat_work, + __round_jiffies_relative(HZ, cpu)); } /* -- cgit v1.2.2 From 58984ce21d315b70df1a43644df7416ea7c9bfd8 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 2 Apr 2009 16:56:42 -0700 Subject: mm: do_xip_mapping_read: fix length calculation The calculation of the value nr in do_xip_mapping_read is incorrect. If the copy required more than one iteration in the do-while loop, the copied variable will be non-zero.
The maximum length that may be passed to the call to copy_to_user(buf+copied, xip_mem+offset, nr) is len-copied but the check only compares against (nr > len). This bug is the cause for the heap corruption Carsten has been chasing for so long: *** glibc detected *** /bin/bash: free(): invalid next size (normal): 0x00000000800e39f0 *** ======= Backtrace: ========= /lib64/libc.so.6[0x200000b9b44] /lib64/libc.so.6(cfree+0x8e)[0x200000bdade] /bin/bash(free_buffered_stream+0x32)[0x80050e4e] /bin/bash(close_buffered_stream+0x1c)[0x80050ea4] /bin/bash(unset_bash_input+0x2a)[0x8001c366] /bin/bash(make_child+0x1d4)[0x8004115c] /bin/bash[0x8002fc3c] /bin/bash(execute_command_internal+0x656)[0x8003048e] /bin/bash(execute_command+0x5e)[0x80031e1e] /bin/bash(execute_command_internal+0x79a)[0x800305d2] /bin/bash(execute_command+0x5e)[0x80031e1e] /bin/bash(reader_loop+0x270)[0x8001efe0] /bin/bash(main+0x1328)[0x8001e960] /lib64/libc.so.6(__libc_start_main+0x100)[0x200000592a8] /bin/bash(clearerr+0x5e)[0x8001c092] With this bug fix the commit 0e4a9b59282914fe057ab17027f55123964bc2e2 "ext2/xip: refuse to change xip flag during remount with busy inodes" can be removed again. 
Cc: Carsten Otte Cc: Nick Piggin Cc: Jared Hulbert Cc: Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b7..427dfe3ce78c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; - if (nr > len) - nr = len; + if (nr > len - copied) + nr = len - copied; error = mapping->a_ops->get_xip_mem(mapping, index, 0, &xip_mem, &xip_pfn); -- cgit v1.2.2 From bf6aede712334d7338d5c47a5ee5ba3883c82a61 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 2 Apr 2009 16:56:54 -0700 Subject: workqueue: add to_delayed_work() helper function It is a fairly common operation to have a pointer to a work and to need a pointer to the delayed work it is contained in. In particular, all delayed works which want to rearm themselves will have to do that. So it would seem fair to offer a helper function for this operation. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: "David S. Miller" Cc: Herbert Xu Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Greg KH Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 825c606f691d..208323fd37bc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3992,8 +3992,7 @@ static void cache_reap(struct work_struct *w) struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); - struct delayed_work *work = - container_of(w, struct delayed_work, work); + struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. 
*/ -- cgit v1.2.2 From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In the following situation, with the memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of a temporary refcnt from the kernel. In general, a cgroup can be rmdir'd if there are no children groups and no tasks. Frequent failures of rmdir() are not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a..8ffec674c5ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ free_out: return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, -- cgit v1.2.2 From 04046e1a0a34286382e913f8fc461440c21d88e8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:33 -0700 Subject: memcg: use CSS ID Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy. Assume the following tree.
group_A (ID=3) /01 (ID=4) /0A (ID=7) /02 (ID=10) group_B (ID=5) and task in group_A/01/0A hits limit at group_A. reclaim will be done in following order (round-robin). group_A(3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02(10) -> group_A -> ..... Round robin by ID. The last visited cgroup is recorded and restart from it when it start reclaim again. (More smart algorithm can be implemented..) No cgroup_mutex or hierarchy_mutex is required. Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 220 +++++++++++++++++++++----------------------------------- 1 file changed, 82 insertions(+), 138 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ffec674c5ac..61fd9590c135 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -95,6 +95,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, return ret; } +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) +{ + s64 ret; + + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); + return ret; +} + /* * per-zone information in memory controller. */ @@ -154,9 +163,9 @@ struct mem_cgroup { /* * While reclaiming in a hiearchy, we cache the last child we - * reclaimed from. Protected by hierarchy_mutex + * reclaimed from. */ - struct mem_cgroup *last_scanned_child; + int last_scanned_child; /* * Should the accounting and control be hierarchical, per subtree? */ @@ -629,103 +638,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -/* - * This routine finds the DFS walk successor. 
This routine should be - * called with hierarchy_mutex held - */ -static struct mem_cgroup * -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) -{ - struct cgroup *cgroup, *curr_cgroup, *root_cgroup; - - curr_cgroup = curr->css.cgroup; - root_cgroup = root_mem->css.cgroup; - - if (!list_empty(&curr_cgroup->children)) { - /* - * Walk down to children - */ - cgroup = list_entry(curr_cgroup->children.next, - struct cgroup, sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } - -visit_parent: - if (curr_cgroup == root_cgroup) { - /* caller handles NULL case */ - curr = NULL; - goto done; - } - - /* - * Goto next sibling - */ - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, - sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } - - /* - * Go up to next parent and next parent's sibling if need be - */ - curr_cgroup = curr_cgroup->parent; - goto visit_parent; - -done: - return curr; -} - -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. - */ -static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *root_mem) -{ - struct cgroup *cgroup; - struct mem_cgroup *orig, *next; - bool obsolete; - - /* - * Scan all children under the mem_cgroup mem - */ - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); - - orig = root_mem->last_scanned_child; - obsolete = mem_cgroup_is_obsolete(orig); - - if (list_empty(&root_mem->css.cgroup->children)) { - /* - * root_mem might have children before and last_scanned_child - * may point to one of them. We put it later. 
- */ - if (orig) - VM_BUG_ON(!obsolete); - next = NULL; - goto done; - } - - if (!orig || obsolete) { - cgroup = list_first_entry(&root_mem->css.cgroup->children, - struct cgroup, sibling); - next = mem_cgroup_from_cont(cgroup); - } else - next = __mem_cgroup_get_next_node(orig, root_mem); - -done: - if (next) - mem_cgroup_get(next); - root_mem->last_scanned_child = next; - if (orig) - mem_cgroup_put(orig); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return (next) ? next : root_mem; -} - static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) { if (do_swap_account) { @@ -755,46 +667,79 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) } /* - * Dance down the hierarchy if needed to reclaim memory. We remember the - * last child we reclaimed from, so that we don't end up penalizing - * one child extensively based on its position in the children list. + * Visit the first child (need not be the first child as per the ordering + * of the cgroup list, since we track last_scanned_child) of @mem and use + * that to reclaim free pages from. + */ +static struct mem_cgroup * +mem_cgroup_select_victim(struct mem_cgroup *root_mem) +{ + struct mem_cgroup *ret = NULL; + struct cgroup_subsys_state *css; + int nextid, found; + + if (!root_mem->use_hierarchy) { + css_get(&root_mem->css); + ret = root_mem; + } + + while (!ret) { + rcu_read_lock(); + nextid = root_mem->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + &found); + if (css && css_tryget(css)) + ret = container_of(css, struct mem_cgroup, css); + + rcu_read_unlock(); + /* Updates scanning parameter */ + spin_lock(&root_mem->reclaim_param_lock); + if (!css) { + /* this means start scan from ID:1 */ + root_mem->last_scanned_child = 0; + } else + root_mem->last_scanned_child = found; + spin_unlock(&root_mem->reclaim_param_lock); + } + + return ret; +} + +/* + * Scan the hierarchy if needed to reclaim memory. 
We remember the last child + * we reclaimed from, so that we don't end up penalizing one child extensively + * based on its position in the children list. * * root_mem is the original ancestor that we've been reclaim from. + * + * We give up and return to the caller when we visit root_mem twice. + * (other groups can be removed while we're walking....) */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, gfp_t gfp_mask, bool noswap) { - struct mem_cgroup *next_mem; - int ret = 0; - - /* - * Reclaim unconditionally and don't check for return value. - * We need to reclaim in the current group and down the tree. - * One might think about checking for children before reclaiming, - * but there might be left over accounting, even after children - * have left. - */ - ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, - get_swappiness(root_mem)); - if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - if (!root_mem->use_hierarchy) - return ret; - - next_mem = mem_cgroup_get_next_node(root_mem); - - while (next_mem != root_mem) { - if (mem_cgroup_is_obsolete(next_mem)) { - next_mem = mem_cgroup_get_next_node(root_mem); + struct mem_cgroup *victim; + int ret, total = 0; + int loop = 0; + + while (loop < 2) { + victim = mem_cgroup_select_victim(root_mem); + if (victim == root_mem) + loop++; + if (!mem_cgroup_local_usage(&victim->stat)) { + /* this cgroup's local usage == 0 */ + css_put(&victim->css); continue; } - ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, - get_swappiness(next_mem)); + /* we use swappiness of local cgroup */ + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, + get_swappiness(victim)); + css_put(&victim->css); + total += ret; if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - next_mem = mem_cgroup_get_next_node(root_mem); + return 1 + total; } - return ret; + return total; } bool mem_cgroup_oom_called(struct 
task_struct *task) @@ -1324,8 +1269,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_charge_statistics(mem, pc, false); + ClearPageCgroupUsed(pc); /* * pc->mem_cgroup is not cleared here. It will be accessed when it's @@ -2178,6 +2123,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + free_css_id(&mem_cgroup_subsys, &mem->css); + for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); @@ -2228,11 +2175,12 @@ static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; + long error = -ENOMEM; int node; mem = mem_cgroup_alloc(); if (!mem) - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) @@ -2260,7 +2208,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); } - mem->last_scanned_child = NULL; + mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); if (parent) @@ -2269,7 +2217,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, @@ -2284,12 +2232,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - struct mem_cgroup *last_scanned_child = mem->last_scanned_child; - if (last_scanned_child) { - VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); - mem_cgroup_put(last_scanned_child); - } mem_cgroup_put(mem); } @@ -2328,6 +2271,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .populate = 
mem_cgroup_populate, .attach = mem_cgroup_move_task, .early_init = 0, + .use_id = 1, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -- cgit v1.2.2 From 14067bb3e24b96d92e22d19c18c0119edf5575e5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:35 -0700 Subject: memcg: hierarchical stat Clean up memory.stat file routine and show "total" hierarchical stat. This patch does - renamed get_all_zonestat to be get_local_zonestat. - remove old mem_cgroup_stat_desc, which is only for per-cpu stat. - add mcs_stat to cover both of per-cpu/per-lru stat. - add "total" stat of hierarchy (*) - add a callback system to scan all memcg under a root. == "total" is added. [kamezawa@localhost ~]$ cat /opt/cgroup/xxx/memory.stat cache 0 rss 0 pgpgin 0 pgpgout 0 inactive_anon 0 active_anon 0 inactive_file 0 active_file 0 unevictable 0 hierarchical_memory_limit 50331648 hierarchical_memsw_limit 9223372036854775807 total_cache 65536 total_rss 192512 total_pgpgin 218 total_pgpgout 155 total_inactive_anon 0 total_active_anon 135168 total_inactive_file 61440 total_active_file 4096 total_unevictable 0 == (*) maybe the user can do calc hierarchical stat by his own program in userland but if it can be written in clean way, it's worth to be shown, I think. 
Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 160 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61fd9590c135..33fc0302e29e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,7 +256,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) return mem_cgroup_zoneinfo(mem, nid, zid); } -static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { int nid, zid; @@ -317,6 +317,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) return css_is_removed(&mem->css); } + +/* + * Call callback function against all cgroup under hierarchy tree. + */ +static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, + int (*func)(struct mem_cgroup *, void *)) +{ + int found, ret, nextid; + struct cgroup_subsys_state *css; + struct mem_cgroup *mem; + + if (!root->use_hierarchy) + return (*func)(root, data); + + nextid = 1; + do { + ret = 0; + mem = NULL; + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, + &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + rcu_read_unlock(); + + if (mem) { + ret = (*func)(mem, data); + css_put(&mem->css); + } + nextid = found + 1; + } while (!ret && css); + + return ret; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. 
@@ -510,8 +546,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1838,54 +1874,90 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static const struct mem_cgroup_stat_desc { - const char *msg; - u64 unit; -} mem_cgroup_stat_desc[] = { - [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, - [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, - [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, - [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, + +/* For read statistics */ +enum { + MCS_CACHE, + MCS_RSS, + MCS_PGPGIN, + MCS_PGPGOUT, + MCS_INACTIVE_ANON, + MCS_ACTIVE_ANON, + MCS_INACTIVE_FILE, + MCS_ACTIVE_FILE, + MCS_UNEVICTABLE, + NR_MCS_STAT, +}; + +struct mcs_total_stat { + s64 stat[NR_MCS_STAT]; }; +struct { + char *local_name; + char *total_name; +} memcg_stat_strings[NR_MCS_STAT] = { + {"cache", "total_cache"}, + {"rss", "total_rss"}, + {"pgpgin", "total_pgpgin"}, + {"pgpgout", "total_pgpgout"}, + {"inactive_anon", "total_inactive_anon"}, + {"active_anon", "total_active_anon"}, + {"inactive_file", "total_inactive_file"}, + {"active_file", "total_active_file"}, + {"unevictable", "total_unevictable"} +}; + + +static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +{ + struct mcs_total_stat *s = data; + s64 val; + + /* per cpu stat */ + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + s->stat[MCS_CACHE] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + s->stat[MCS_RSS] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + 
s->stat[MCS_PGPGIN] += val; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + s->stat[MCS_PGPGOUT] += val; + + /* per zone stat */ + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; + return 0; +} + +static void +mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +{ + mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); +} + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); - struct mem_cgroup_stat *stat = &mem_cont->stat; + struct mcs_total_stat mystat; int i; - for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { - s64 val; + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_local_stat(mem_cont, &mystat); - val = mem_cgroup_read_stat(stat, i); - val *= mem_cgroup_stat_desc[i].unit; - cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); - } - /* showing # of active pages */ - { - unsigned long active_anon, inactive_anon; - unsigned long active_file, inactive_file; - unsigned long unevictable; - - inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_ANON); - active_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_ANON); - inactive_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_FILE); - active_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_FILE); - unevictable = mem_cgroup_get_all_zonestat(mem_cont, - LRU_UNEVICTABLE); - - cb->fill(cb, "active_anon", (active_anon) * 
PAGE_SIZE); - cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); - cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); - cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); - cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); - } + /* Hierarchical information */ { unsigned long long limit, memsw_limit; memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); @@ -1894,6 +1966,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); } + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_total_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); + + #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); -- cgit v1.2.2 From 81d39c20f5ee2437d71709beb82597e2a38efbbc Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:36 -0700 Subject: memcg: fix shrinking memory to return -EBUSY by fixing retry algorithm As pointed out, shrinking memcg's limit should return -EBUSY after reasonable retries. This patch tries to fix the current behavior of shrink_usage. Before looking into "shrink should return -EBUSY" problem, we should fix hierarchical reclaim code. It compares current usage and current limit, but it only makes sense when the kernel reclaims memory because hit limits. This is also a problem. What this patch does are. 1. add new argument "shrink" to hierarchical reclaim. If "shrink==true", hierarchical reclaim returns immediately and the caller checks the kernel should shrink more or not. (At shrinking memory, usage is always smaller than limit. So check for usage < limit is useless.) 2. For adjusting to above change, 2 changes in "shrink"'s retry path. 2-a. 
retry_count depends on # of children because the kernel visits the children under hierarchy one by one. 2-b. rather than checking return value of hierarchical_reclaim's progress, compares usage-before-shrink and usage-after-shrink. If usage-before-shrink <= usage-after-shrink, retry_count is decremented. Reported-by: Li Zefan Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33fc0302e29e..6f6a575e77ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -702,6 +702,23 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } +static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +{ + int *val = data; + (*val)++; + return 0; +} +/* + * This function returns the number of memcg under hierarchy tree. Returns + * 1(self count) if no children. + */ +static int mem_cgroup_count_children(struct mem_cgroup *mem) +{ + int num = 0; + mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + return num; +} + /* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use @@ -750,9 +767,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * * We give up and return to the caller when we visit root_mem twice. * (other groups can be removed while we're walking....) + * + * If shrink==true, for avoiding to free too much, this returns immedieately. 
*/ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap) + gfp_t gfp_mask, bool noswap, bool shrink) { struct mem_cgroup *victim; int ret, total = 0; @@ -771,6 +790,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); css_put(&victim->css); + /* + * At shrinking usage, we can't check we should stop here or + * reclaim more. It's depends on callers. last_scanned_child + * will work enough for keeping fairness under tree. + */ + if (shrink) + return ret; total += ret; if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; @@ -856,7 +882,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap); + noswap, false); if (ret) continue; @@ -1489,7 +1515,8 @@ int mem_cgroup_shrink_usage(struct page *page, return 0; do { - progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); + progress = mem_cgroup_hierarchical_reclaim(mem, + gfp_mask, true, false); progress += mem_cgroup_check_under_limit(mem); } while (!progress && --retry); @@ -1504,11 +1531,21 @@ static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { - - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; int progress; u64 memswlimit; int ret = 0; + int children = mem_cgroup_count_children(memcg); + u64 curusage, oldusage; + + /* + * For keeping hierarchical_reclaim simple, how long we should retry + * is depends on callers. We set our retry-count to be function + * of # of children which we should visit in this loop. 
+ */ + retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + + oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); while (retry_count) { if (signal_pending(current)) { @@ -1534,8 +1571,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false); - if (!progress) retry_count--; + false, true); + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + /* Usage is reduced ? */ + if (curusage >= oldusage) + retry_count--; + else + oldusage = curusage; } return ret; @@ -1544,13 +1586,16 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; u64 memlimit, oldusage, curusage; - int ret; + int children = mem_cgroup_count_children(memcg); + int ret = -EBUSY; if (!do_swap_account) return -EINVAL; - + /* see mem_cgroup_resize_res_limit */ + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -1574,11 +1619,13 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); + mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; + else + oldusage = curusage; } return ret; } -- cgit v1.2.2 From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. 
But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in the following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77ad..025f8abfae2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup.
We use css_tryget() here even if this looks @@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } @@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. @@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; } -- cgit v1.2.2 From e222432bfa7dcf6ec008622a978c9f284ed5e3a9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 2 Apr 2009 16:57:39 -0700 Subject: memcg: show memcg information during OOM Add RSS and swap to OOM output from memcg Display memcg values like failcnt, usage and limit when an OOM occurs due to memcg. Thanks to Johannes Weiner, Li Zefan, David Rientjes, Kamezawa Hiroyuki, Daisuke Nishimura and KOSAKI Motohiro for review. 
Sample output ------------- Task in /a/x killed as a result of limit of /a memory: usage 1048576kB, limit 1048576kB, failcnt 4183 memory+swap: usage 1400964kB, limit 9007199254740991kB, failcnt 0 [akpm@linux-foundation.org: compilation fix] [akpm@linux-foundation.org: fix kerneldoc and whitespace] [akpm@linux-foundation.org: add printk facility level] Signed-off-by: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Li Zefan Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 1 + 2 files changed, 70 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025f8abfae2d..2bdb6149faeb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -721,6 +722,74 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) (*val)++; return 0; } + +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled + */ +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; + /* + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. 
+ */ + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; + + + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + /* + * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information + */ + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + printk(KERN_INFO "Task in %s killed", memcg_name); + + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); +done: + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); +} + /* * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3b9bac085b5..2f3166e308d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -394,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); + mem_cgroup_print_oom_info(mem, current); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); -- cgit v1.2.2 From c137b5ece4b111e46981aae7da77315b9909809f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:40 -0700 Subject: memcg: remove mem_cgroup_calc_mapped_ratio() Currently, mem_cgroup_calc_mapped_ratio() is not used at all. It can be removed, as KAMEZAWA-san suggested. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdb6149faeb..7bb14fdc780c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -507,23 +507,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. - */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - /* * prev_priority control...this will be used in memory reclaim path.
*/ -- cgit v1.2.2 From 3c776e64660028236313f0e54f3a9945764422df Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 2 Apr 2009 16:57:43 -0700 Subject: memcg: charge swapcache to proper memcg memcg_test.txt says at 4.1: This swap-in is one of the most complicated work. In do_swap_page(), following events occur when pte is unchanged. (1) the page (SwapCache) is looked up. (2) lock_page() (3) try_charge_swapin() (4) reuse_swap_page() (may call delete_swap_cache()) (5) commit_charge_swapin() (6) swap_free(). Considering following situation for example. (A) The page has not been charged before (2) and reuse_swap_page() doesn't call delete_from_swap_cache(). (B) The page has not been charged before (2) and reuse_swap_page() calls delete_from_swap_cache(). (C) The page has been charged before (2) and reuse_swap_page() doesn't call delete_from_swap_cache(). (D) The page has been charged before (2) and reuse_swap_page() calls delete_from_swap_cache(). memory.usage/memsw.usage changes to this page/swp_entry will be Case (A) (B) (C) (D) Event Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 =========================================== (3) +1/+1 +1/+1 +1/+1 +1/+1 (4) - 0/ 0 - -1/ 0 (5) 0/-1 0/ 0 -1/-1 0/ 0 (6) - 0/-1 - 0/-1 =========================================== Result 1/ 1 1/ 1 1/ 1 1/ 1 In any cases, charges to this page should be 1/ 1. In case of (D), mem_cgroup_try_get_from_swapcache() returns NULL (because lookup_swap_cgroup() returns NULL), so "+1/+1" at (3) means charges to the memcg("foo") to which the "current" belongs. OTOH, "-1/0" at (4) and "0/-1" at (6) means uncharges from the memcg("baa") to which the page has been charged. So, if the "foo" and "baa" is different(for example because of task move), this charge will be moved from "baa" to "foo". I think this is an unexpected behavior. This patch fixes this by modifying mem_cgroup_try_get_from_swapcache() to return the memcg to which the swapcache has been charged if PCG_USED bit is set. 
IIUC, checking PCG_USED bit of swapcache is safe under page lock. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bb14fdc780c..81b0ae8183d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -994,13 +994,24 @@ nomem: static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; + struct page_cgroup *pc; swp_entry_t ent; + VM_BUG_ON(!PageLocked(page)); + if (!PageSwapCache(page)) return NULL; - ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + pc = lookup_page_cgroup(page); + /* + * Used bit of swapcache is solid under page lock. + */ + if (PageCgroupUsed(pc)) + mem = pc->mem_cgroup; + else { + ent.val = page_private(page); + mem = lookup_swap_cgroup(ent); + } if (!mem) return NULL; if (!css_tryget(&mem->css)) -- cgit v1.2.2 From a3b2d692690aef228e493b1beaafe5364cab3237 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:45 -0700 Subject: cgroups: use css id in swap cgroup for saving memory v5 Try to use CSS ID for records in swap_cgroup. By this, on 64bit machine, size of swap_cgroup goes down to 2 bytes from 8bytes. This means, when 2GB of swap is equipped, (assume the page size is 4096bytes) From size of swap_cgroup = 2G/4k * 8 = 4Mbytes. To size of swap_cgroup = 2G/4k * 2 = 1Mbytes. Reduction is large. Of course, there are trade-offs. This CSS ID will add overhead to swap-in/swap-out/swap-free. But in general, - swap is a resource which the user tend to avoid use. - If swap is never used, swap_cgroup area is not used. - Reading traditional manuals, size of swap should be proportional to size of memory. Memory size of machine is increasing now. I think reducing size of swap_cgroup makes sense. 
Note: - ID->CSS lookup routine has no locks, it's under RCU-Read-Side. - memcg can be obsolete at rmdir() but not freed while refcnt from swap_cgroup is available. Changelog v4->v5: - reworked on to memcg-charge-swapcache-to-proper-memcg.patch Changlog ->v4: - fixed not configured case. - deleted unnecessary comments. - fixed NULL pointer bug. - fixed message in dmesg. [nishimura@mxp.nes.nec.co.jp: css_tryget can be called twice in !PageCgroupUsed case] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Paul Menage Cc: Hugh Dickins Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++--------- mm/page_cgroup.c | 32 +++++++++++------------- 2 files changed, 76 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81b0ae8183d0..55dea5968464 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -991,10 +991,31 @@ nomem: return -ENOMEM; } + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller must check css_is_removed() or some if + * it's concern. (dropping refcnt from swap can be called against removed + * memcg.) + */ +static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) +{ + struct cgroup_subsys_state *css; + + /* ID 0 is unused ID */ + if (!id) + return NULL; + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return container_of(css, struct mem_cgroup, css); +} + static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; struct page_cgroup *pc; + unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); @@ -1006,16 +1027,19 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) /* * Used bit of swapcache is solid under page lock. 
*/ - if (PageCgroupUsed(pc)) + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; - else { + if (mem && !css_tryget(&mem->css)) + mem = NULL; + } else { ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup(ent); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); + if (mem && !css_tryget(&mem->css)) + mem = NULL; + rcu_read_unlock(); } - if (!mem) - return NULL; - if (!css_tryget(&mem->css)) - return NULL; return mem; } @@ -1276,12 +1300,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (do_swap_account && !ret && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); if (mem) { + /* + * We did swap-in. Then, this entry is doubly counted + * both in mem and memsw. We uncharge it, here. + * Recorded ID can be obsolete. We avoid calling + * css_tryget() + */ res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_put(mem); } + rcu_read_unlock(); } return ret; } @@ -1346,13 +1380,21 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; struct mem_cgroup *memcg; - memcg = swap_cgroup_record(ent, NULL); + + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * This recorded memcg can be obsolete one. So, avoid + * calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } - + rcu_read_unlock(); } /* add this page(page_cgroup) to the LRU we want. 
*/ @@ -1473,7 +1515,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ if (do_swap_account && memcg) { - swap_cgroup_record(ent, memcg); + swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } if (memcg) @@ -1488,15 +1530,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) void mem_cgroup_uncharge_swap(swp_entry_t ent) { struct mem_cgroup *memcg; + unsigned short id; if (!do_swap_account) return; - memcg = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * We uncharge this because swap is freed. + * This memcg can be obsolete one. We avoid calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } + rcu_read_unlock(); } #endif diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ceecfbb143fa..ebf81074bed4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -285,12 +285,8 @@ struct swap_cgroup_ctrl { struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -/* - * This 8bytes seems big..maybe we can reduce this when we can use "id" for - * cgroup rather than pointer. - */ struct swap_cgroup { - struct mem_cgroup *val; + unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) #define SC_POS_MASK (SC_PER_PAGE - 1) @@ -342,10 +338,10 @@ not_enough_page: * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded * - * Returns old value at success, NULL at failure. - * (Of course, old value can be NULL.) + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) 
*/ -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *old; + unsigned short old; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - old = sc->val; - sc->val = mem; + old = sc->id; + sc->id = id; return old; } @@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry * @ent: swap entry to be looked up. * - * Returns pointer to mem_cgroup at success. NULL at failure. + * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *ret; + unsigned short ret; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - ret = sc->val; + ret = sc->id; return ret; } @@ -432,7 +428,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) printk(KERN_INFO "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" - " and %ld bytes to hold mem_cgroup pointers on swap\n", + " and %ld bytes to hold mem_cgroup information per swap ents\n", array_size, length * PAGE_SIZE); printk(KERN_INFO "swap_cgroup can be disabled by noswapaccount boot option.\n"); -- cgit v1.2.2 
From 627991a20b3f4d504d20466ab405fe035cb1a20a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:47 -0700 Subject: memcg: remove redundant message at swapon It's pointed out that swap_cgroup's message at swapon() is nonsense. Because * It can be calculated very easily if all necessary information is written in Kconfig. * It's not necessary to annoy people at every swapon(). From another point of view, memory usage per swp_entry is now reduced to 2 bytes from 8 bytes (64bit) and I think it's reasonably small. Reported-by: Hugh Dickins Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ebf81074bed4..791905c991df 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -426,13 +426,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) } mutex_unlock(&swap_cgroup_mutex); - printk(KERN_INFO - "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" - " and %ld bytes to hold mem_cgroup information per swap ents\n", - array_size, length * PAGE_SIZE); - printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option.\n"); - return 0; nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); -- cgit v1.2.2 From 83aae4c737866da3280f51fd15da58eddd788397 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 2 Apr 2009 16:57:48 -0700 Subject: memcg: cleanup cache_charge Current mem_cgroup_cache_charge is a bit complicated especially in the case of shmem's swap-in. This patch cleans it up by using try_charge_swapin and commit_charge_swapin. 
Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 60 ++++++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 55dea5968464..2fc6d6c48238 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1238,6 +1238,10 @@ int mem_cgroup_newpage_charge(struct page *page, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype); + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -1274,16 +1278,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (do_swap_account && PageSwapCache(page)) { - mem = try_get_mem_cgroup_from_swapcache(page); - if (mem) - mm = NULL; - else - mem = NULL; - /* SwapCache may be still linked to LRU now. 
*/ - mem_cgroup_lru_del_before_commit_swapcache(page); - } - if (unlikely(!mm && !mem)) mm = &init_mm; @@ -1291,32 +1285,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (mem) - css_put(&mem->css); - if (PageSwapCache(page)) - mem_cgroup_lru_add_after_commit_swapcache(page); + /* shmem */ + if (PageSwapCache(page)) { + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + __mem_cgroup_commit_charge_swapin(page, mem, + MEM_CGROUP_CHARGE_TYPE_SHMEM); + } else + ret = mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (do_swap_account && !ret && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - unsigned short id; - /* avoid double counting */ - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - mem = mem_cgroup_lookup(id); - if (mem) { - /* - * We did swap-in. Then, this entry is doubly counted - * both in mem and memsw. We uncharge it, here. - * Recorded ID can be obsolete. 
We avoid calling - * css_tryget() - */ - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_put(mem); - } - rcu_read_unlock(); - } return ret; } @@ -1359,7 +1337,9 @@ charge_cur_mm: return __mem_cgroup_try_charge(mm, mask, ptr, true); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype) { struct page_cgroup *pc; @@ -1369,7 +1349,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) return; pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge(ptr, pc, ctype); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -1400,6 +1380,12 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) } +void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +{ + __mem_cgroup_commit_charge_swapin(page, ptr, + MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) { if (mem_cgroup_disabled()) -- cgit v1.2.2 From 03fb3d2af96c2783c3a5bc03f3d984cf422f0e69 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: FS-Cache: Release page->private after failed readahead The attached patch causes read_cache_pages() to release page-private data on a page for which add_to_page_cache() fails. If the filler function fails, then the problematic page is left attached to the pagecache (with appropriate flags set, one presumes) and the remaining to-be-attached pages are invalidated and discarded. This permits pages with caching references associated with them to be cleaned up. The invalidatepage() address space op is called (indirectly) to do the honours. 
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- mm/readahead.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 9ce303d4b810..6be927569cf6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -31,6 +31,41 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) +/* + * see if a page needs releasing upon read_cache_pages() failure + * - the caller of read_cache_pages() may have set PG_private before calling, + * such as the NFS fs marking pages that are cached locally on disk, thus we + * need to give the fs a chance to clean up in the event of an error + */ +static void read_cache_pages_invalidate_page(struct address_space *mapping, + struct page *page) +{ + if (PagePrivate(page)) { + if (!trylock_page(page)) + BUG(); + page->mapping = mapping; + do_invalidatepage(page, 0); + page->mapping = NULL; + unlock_page(page); + } + page_cache_release(page); +} + +/* + * release a list of pages, invalidating them first if need be + */ +static void read_cache_pages_invalidate_pages(struct address_space *mapping, + struct list_head *pages) +{ + struct page *victim; + + while (!list_empty(pages)) { + victim = list_to_page(pages); + list_del(&victim->lru); + read_cache_pages_invalidate_page(mapping, victim); + } +} + /** * read_cache_pages - populate an address space with some pages & start reads against them * @mapping: the address_space @@ -52,14 +87,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { - page_cache_release(page); + read_cache_pages_invalidate_page(mapping, page); continue; } page_cache_release(page); ret = filler(data, page); if (unlikely(ret)) { - put_pages_list(pages); + 
read_cache_pages_invalidate_pages(mapping, pages); break; } task_io_account_read(PAGE_CACHE_SIZE); -- cgit v1.2.2 From 266cf658efcf6ac33541a46740f74f50c79d2b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Recruit a page flags for cache management Recruit a page flag to aid in cache management. The following extra flag is defined: (1) PG_fscache (PG_private_2) The marked page is backed by a local cache and is pinning resources in the cache driver. If PG_fscache is set, then things that checked for PG_private will now also check for that. This includes things like truncation and page invalidation. The function page_has_private() had been added to make the checks for both PG_private and PG_private_2 at the same time. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- mm/filemap.c | 3 +++ mm/migrate.c | 10 +++++----- mm/readahead.c | 9 +++++---- mm/swap.c | 4 ++-- mm/truncate.c | 10 +++++----- mm/vmscan.c | 6 +++--- 6 files changed, 23 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 126d3973b3d1..cbc5772e7171 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2463,6 +2463,9 @@ EXPORT_SYMBOL(generic_file_aio_write); * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 
* diff --git a/mm/migrate.c b/mm/migrate.c index a9eff3f092f6..068655d8f883 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -250,7 +250,7 @@ out: * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping - * 3 for pages with a mapping and PagePrivate set. + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!PagePrivate(page); + expected_count = 2 + !!page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. + * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. */ @@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (PagePrivate(page) && + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; @@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. 
*/ if (!page->mapping) { - if (!PageAnon(page) && PagePrivate(page)) { + if (!PageAnon(page) && page_has_private(page)) { /* * Go direct to try_to_free_buffers() here because * a) that's what try_to_release_page() would do anyway diff --git a/mm/readahead.c b/mm/readahead.c index 6be927569cf6..133b6d525513 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -33,14 +33,15 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); /* * see if a page needs releasing upon read_cache_pages() failure - * - the caller of read_cache_pages() may have set PG_private before calling, - * such as the NFS fs marking pages that are cached locally on disk, thus we - * need to give the fs a chance to clean up in the event of an error + * - the caller of read_cache_pages() may have set PG_private or PG_fscache + * before calling, such as the NFS fs marking pages that are cached locally + * on disk, thus we need to give the fs a chance to clean up in the event of + * an error */ static void read_cache_pages_invalidate_page(struct address_space *mapping, struct page *page) { - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!trylock_page(page)) BUG(); page->mapping = mapping; diff --git a/mm/swap.c b/mm/swap.c index 6e83084c1f6c..bede23ce64ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 1229211104f8..55206fab7b99 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - if (PagePrivate(page)) + if 
(page_has_private(page)) do_invalidatepage(page, partial); } @@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, 0); cancel_dirty_page(page, PAGE_CACHE_SIZE); @@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, 0)) + if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; clear_page_mlock(page); @@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; spin_lock_irq(&mapping->tree_lock); @@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; clear_page_mlock(page); - BUG_ON(PagePrivate(page)); + BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); page_cache_release(page); /* pagecache ref */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 06e72693b458..425244988bb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -283,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + return page_count(page) - !!page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. 
*/ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); printk("%s: orphaned page\n", __func__); @@ -727,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { -- cgit v1.2.2 From 385e1ca5f21c4680ad6a46a3aa2ea8af99e99c92 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:39 +0100 Subject: CacheFiles: Permit the page lock state to be monitored Add a function to install a monitor on the page lock waitqueue for a particular page, thus allowing the page being unlocked to be detected. This is used by CacheFiles to detect read completion on a page in the backing filesystem so that it can then copy the data to the waiting netfs page. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- mm/filemap.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index cbc5772e7171..fc11974f2bee 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -564,6 +564,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit); +/** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. 
+ */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + /** * unlock_page - unlock a locked page * @page: the page -- cgit v1.2.2