From 1536cb39338aff16b0e30cc6708da03b268337f7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:05 -0700 Subject: mm/slab.c: add __init to init_lock_keys init_lock_keys is only called by __init kmem_cache_init_late Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3070b929a1bf..18ac44b7558d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -569,7 +569,7 @@ static inline void on_slab_lock_classes(struct kmem_cache *cachep) on_slab_lock_classes_node(cachep, node); } -static inline void init_lock_keys(void) +static inline void __init init_lock_keys(void) { int node; @@ -577,7 +577,7 @@ static inline void init_lock_keys(void) init_node_lock_keys(node); } #else -static void init_node_lock_keys(int q) +static void __init init_node_lock_keys(int q) { } -- cgit v1.2.2 From 44c5356fb460053112ab87c9601df1605054edca Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:07 -0700 Subject: slab common: add functions for kmem_cache_node access The patchset provides two new functions in mm/slab.h and modifies SLAB and SLUB to use these. The kmem_cache_node structure is shared between both allocators and the use of common accessors will allow us to move more code into slab_common.c in the future. This patch (of 3): These functions allow to eliminate repeatedly used code in both SLAB and SLUB and also allow for the insertion of debugging code that may be needed in the development process. 
Signed-off-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 17 ++++++++++++++++- mm/slub.c | 5 ----- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 961a3fb1f5a2..3f9766e393a3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -262,7 +262,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) } #endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -294,5 +294,20 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ + if (__n) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); diff --git a/mm/slub.c b/mm/slub.c index 73004808537e..2569802aa7cc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) -- cgit v1.2.2 From fa45dc254bcf740852752effa35387be684947f8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:09 -0700 Subject: slub: use new node functions Make use of the new node functions in mm/slab.h to reduce code size and simplify. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 78 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2569802aa7cc..3918cd62a4b2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2157,6 +2157,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@ -2171,15 +2172,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -2923,13 +2920,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3217,12 +3211,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n 
= get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3407,9 +3400,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3581,6 +3572,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3590,19 +3582,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3955,16 +3944,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4118,6 +4105,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if 
(!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4127,8 +4115,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4327,8 +4314,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4344,9 +4332,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4359,7 +4347,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -4373,16 +4361,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -5337,13 +5321,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - 
for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); -- cgit v1.2.2 From 18bf854117c6caa4d0083bd42411895163467cb9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:11 -0700 Subject: slab: use get_node() and kmem_cache_node() functions Use the two functions to simplify the code avoiding numerous explicit checks coded checking for a certain node to be online. Get rid of various repeated calculations of kmem_cache_node structures. [akpm@linux-foundation.org: fix build] Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 173 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 80 insertions(+), 93 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 18ac44b7558d..66b3ffbb890d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -267,7 +267,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->node[nodeid]->slab), listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -488,16 +488,11 @@ static struct lock_class_key debugobj_alc_key; static void slab_set_lock_classes(struct kmem_cache *cachep, struct lock_class_key *l3_key, struct lock_class_key *alc_key, - int q) + struct kmem_cache_node *n) { struct array_cache **alc; - struct kmem_cache_node *n; int r; - n = cachep->node[q]; - if (!n) - return; - lockdep_set_class(&n->list_lock, l3_key); alc = n->alien; /* @@ -515,17 +510,19 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, } } -static void slab_set_debugobj_lock_classes_node(struct 
kmem_cache *cachep, int node) +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); + slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, n); } static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) - slab_set_debugobj_lock_classes_node(cachep, node); + for_each_kmem_cache_node(cachep, node, n) + slab_set_debugobj_lock_classes_node(cachep, n); } static void init_node_lock_keys(int q) @@ -542,31 +539,30 @@ static void init_node_lock_keys(int q) if (!cache) continue; - n = cache->node[q]; + n = get_node(cache, q); if (!n || OFF_SLAB(cache)) continue; slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, q); + &on_slab_alc_key, n); } } -static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +static void on_slab_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { - if (!cachep->node[q]) - return; - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, q); + &on_slab_alc_key, n); } static inline void on_slab_lock_classes(struct kmem_cache *cachep) { int node; + struct kmem_cache_node *n; VM_BUG_ON(OFF_SLAB(cachep)); - for_each_node(node) - on_slab_lock_classes_node(cachep, node); + for_each_kmem_cache_node(cachep, node, n) + on_slab_lock_classes_node(cachep, n); } static inline void __init init_lock_keys(void) @@ -589,11 +585,13 @@ static inline void on_slab_lock_classes(struct kmem_cache *cachep) { } -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { } -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { } @@ 
-826,7 +824,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); struct page *page; unsigned long flags; @@ -881,7 +879,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - n = cachep->node[numa_mem_id()]; + n = get_node(cachep, numa_mem_id()); if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); ClearPageSlabPfmemalloc(page); @@ -1031,7 +1029,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_cache_node *n = cachep->node[node]; + struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { spin_lock(&n->list_lock); @@ -1099,7 +1097,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(nodeid == node)) return 0; - n = cachep->node[node]; + n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; @@ -1111,9 +1109,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { - spin_lock(&(cachep->node[nodeid])->list_lock); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->node[nodeid])->list_lock); + spin_unlock(&n->list_lock); } return 1; } @@ -1140,7 +1139,8 @@ static int init_cache_node_node(int node) * begin anything. 
Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->node[node]) { + n = get_node(cachep, node); + if (!n) { n = kmalloc_node(memsize, GFP_KERNEL, node); if (!n) return -ENOMEM; @@ -1156,11 +1156,11 @@ static int init_cache_node_node(int node) cachep->node[node] = n; } - spin_lock_irq(&cachep->node[node]->list_lock); - cachep->node[node]->free_limit = + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->node[node]->list_lock); + spin_unlock_irq(&n->list_lock); } return 0; } @@ -1186,7 +1186,7 @@ static void cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) goto free_array_cache; @@ -1229,7 +1229,7 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; drain_freelist(cachep, n, slabs_tofree(cachep, n)); @@ -1284,7 +1284,7 @@ static int cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(!n); spin_lock_irq(&n->list_lock); @@ -1306,10 +1306,10 @@ static int cpuup_prepare(long cpu) kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, node); + slab_set_debugobj_lock_classes_node(cachep, n); else if (!OFF_SLAB(cachep) && !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, node); + on_slab_lock_classes_node(cachep, n); } init_node_lock_keys(node); @@ -1395,7 +1395,7 @@ static int __meminit drain_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *n; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; @@ -1690,14 +1690,10 @@ slab_out_of_memory(struct kmem_cache 
*cachep, gfp_t gfpflags, int nodeid) printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); - for_each_online_node(node) { + for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - n = cachep->node[node]; - if (!n) - continue; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; @@ -2434,7 +2430,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2442,7 +2438,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[node]->list_lock); + assert_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2462,12 +2458,14 @@ static void do_drain(void *arg) struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_mem_id(); + struct kmem_cache_node *n; check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->node[node]->list_lock); + n = get_node(cachep, node); + spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->node[node]->list_lock); + spin_unlock(&n->list_lock); ac->avail = 0; } @@ -2478,17 +2476,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); - for_each_online_node(node) { - n = cachep->node[node]; - if (n && n->alien) + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) drain_alien_cache(cachep, n->alien); - } - for_each_online_node(node) { - n = cachep->node[node]; - if (n) - drain_array(cachep, n, n->shared, 1, node); - } + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); } /* 
@@ -2534,17 +2527,14 @@ out: int __kmem_cache_shrink(struct kmem_cache *cachep) { - int ret = 0, i = 0; + int ret = 0; + int node; struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); - for_each_online_node(i) { - n = cachep->node[i]; - if (!n) - continue; - + for_each_kmem_cache_node(cachep, node, n) { drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || @@ -2566,13 +2556,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(cachep->array[i]); /* NUMA: free the node structures */ - for_each_online_node(i) { - n = cachep->node[i]; - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - } + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; } return 0; } @@ -2751,7 +2739,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. 
*/ @@ -2920,7 +2908,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); spin_lock(&n->list_lock); @@ -3169,8 +3157,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->node[nid] && - cache->node[nid]->free_objects) { + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3233,7 +3221,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int x; VM_BUG_ON(nodeid > num_online_nodes()); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); BUG_ON(!n); retry: @@ -3304,7 +3292,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->node[nodeid])) { + if (unlikely(!get_node(cachep, nodeid))) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3420,7 +3408,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = cachep->node[node]; + n = get_node(cachep, node); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); @@ -3462,7 +3450,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - n = cachep->node[node]; + n = get_node(cachep, node); spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; @@ -3775,7 +3763,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; @@ -3820,9 +3808,8 @@ fail: /* Cache is not active yet. 
Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3884,11 +3871,17 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, for_each_online_cpu(i) { struct array_cache *ccold = new->new[i]; + int node; + struct kmem_cache_node *n; + if (!ccold) continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); + + node = cpu_to_mem(i); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ccold->entry, ccold->avail, node); + spin_unlock_irq(&n->list_lock); kfree(ccold); } kfree(new); @@ -4048,7 +4041,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -4100,10 +4093,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4328,10 +4318,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); -- cgit v1.2.2 From 5240ab4076bd3815473f2f2991741acc698f8b58 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:14 -0700 Subject: mm: slab.h: wrap the whole file with guarding macro Guarding section: #ifndef MM_SLAB_H #define MM_SLAB_H ... #endif currently doesn't cover the whole mm/slab.h. It seems like it was done unintentionally. 
Wrap the whole file by moving closing #endif to the end of it. Signed-off-by: Andrey Ryabinin Acked-by: Christoph Lameter Acked-by: David Rientjes Reviewed-by: Vladimir Davydov Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 3f9766e393a3..3822b65edcc2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -260,7 +260,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) WARN_ON_ONCE(1); return s; } -#endif #ifndef CONFIG_SLOB /* @@ -311,3 +310,5 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ -- cgit v1.2.2 From c07b8183cbb86d34007e5a3935e0ec89f5bb83c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:04:16 -0700 Subject: mm, slub: mark resiliency_test as init text resiliency_test() is only called for bootstrap, so it may be moved to init.text and freed after boot. 
Signed-off-by: David Rientjes Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 3918cd62a4b2..2d61503efb92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4187,7 +4187,7 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; -- cgit v1.2.2 From 02e72cc61713185013d958baba508288ba2a0157 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:18 -0700 Subject: mm: slub: SLUB_DEBUG=n: use the same alloc/free hooks as for SLUB_DEBUG=y There are two versions of alloc/free hooks now - one for CONFIG_SLUB_DEBUG=y and another one for CONFIG_SLUB_DEBUG=n. I see no reason why calls to other debugging subsystems (LOCKDEP, DEBUG_ATOMIC_SLEEP, KMEMCHECK and FAILSLAB) are hidden under SLUB_DEBUG. All these features should work regardless of the SLUB_DEBUG config, as all of them already have their own Kconfig options. This also fixes failslab for the CONFIG_SLUB_DEBUG=n configuration. It simply has not worked before because the should_failslab() call was in a hook hidden under "#ifdef CONFIG_SLUB_DEBUG #else". Note: There is one concealed change in the allocation path for SLUB_DEBUG=n and all other debugging features disabled. The might_sleep_if() call can generate some code even if DEBUG_ATOMIC_SLEEP=n. For PREEMPT_VOLUNTARY=y might_sleep() inserts a _cond_resched() call, but I think it should be ok. 
Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 97 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 36 insertions(+), 61 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2d61503efb92..92d8139c556d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -939,60 +939,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } } -/* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. 
- */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - /* * Tracking of fully allocated slabs for debugging purposes. */ @@ -1277,6 +1223,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@ -1288,21 +1240,44 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) + return should_failslab(s->object_size, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. 
+ */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing -- cgit v1.2.2 From 8a9c61d4381c5e5007cc68e023940b18fa0808d7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:20 -0700 Subject: slab: add unlikely macro to help compiler This patchset does some cleanup and tries to remove lockdep annotation. Patches 1~2 are just for really really minor improvement. Patches 3~9 are for clean-up and removing lockdep annotation. There are two cases that lockdep annotation is needed in SLAB. 1) holding two node locks 2) holding two array cache(alien cache) locks I looked at the code and found that we can avoid these cases without any negative effect. 1) occurs if freeing object makes new free slab and we decide to destroy it. Although we don't need to hold the lock during destroying a slab, current code do that. Destroying a slab without holding the lock would help the reduction of the lock contention. To do it, I change the implementation that new free slab is destroyed after releasing the lock. 2) occurs on similar situation. When we free object from non-local node, we put this object to alien cache with holding the alien cache lock. If alien cache is full, we try to flush alien cache to proper node cache, and, in this time, new free slab could be made. Destroying it would be started and we will free metadata object which comes from another node. In this case, we need another node's alien cache lock to free object. This forces us to hold two array cache locks and then we need lockdep annotation although they are always different locks and deadlock cannot be possible. To prevent this situation, I use same way as 1). 
In this way, we can avoid cases 1) and 2), and can then remove the lockdep annotation. As the short stat notes, this makes the SLAB code much simpler. This patch (of 9): slab_should_failslab() is called on every allocation, so it is reasonable to optimize it. We normally don't allocate from kmem_cache. It is only used when a new kmem_cache is created, so this is a very rare case. Therefore, add the unlikely macro to help compiler optimization. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 66b3ffbb890d..7d07942b9804 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3048,7 +3048,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == kmem_cache) + if (unlikely(cachep == kmem_cache)) return false; return should_failslab(cachep->object_size, flags, cachep->flags); -- cgit v1.2.2 From 25c063fbd5512eb7190bf5af88351109aededb3f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:22 -0700 Subject: slab: move up code to get kmem_cache_node in free_block() node isn't changed, so we don't need to retrieve this structure every time we move the object. Maybe the compiler does this optimization, but making it explicit is better.
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7d07942b9804..205632c94a6a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3398,7 +3398,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; - struct kmem_cache_node *n; + struct kmem_cache_node *n = get_node(cachep, node); for (i = 0; i < nr_objects; i++) { void *objp; @@ -3408,7 +3408,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = get_node(cachep, node); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); -- cgit v1.2.2 From 97654dfa20caa5e6c1b0a4af715aabaf5d070d69 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:25 -0700 Subject: slab: defer slab_destroy in free_block() In free_block(), if freeing an object makes a new free slab and the number of free_objects exceeds free_limit, we start to destroy this new free slab while holding the kmem_cache node lock. Holding the lock is useless and, generally, holding a lock for as short a time as possible is a good thing. I have not measured the performance effect of this, but it is better not to hold the lock any longer than necessary. Commented by Christoph: This is also good because kmem_cache_free is no longer called while holding the node lock. So we avoid one case of recursion.
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 60 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 205632c94a6a..f6ad8d335be7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -242,7 +242,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -1030,6 +1031,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { struct kmem_cache_node *n = get_node(cachep, node); + LIST_HEAD(list); if (ac->avail) { spin_lock(&n->list_lock); @@ -1041,9 +1043,10 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &list); ac->avail = 0; spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -1087,6 +1090,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; + LIST_HEAD(list); node = numa_mem_id(); @@ -1111,8 +1115,9 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, &list); spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } 
return 1; } @@ -1182,6 +1187,7 @@ static void cpuup_canceled(long cpu) struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + LIST_HEAD(list); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; @@ -1196,7 +1202,7 @@ static void cpuup_canceled(long cpu) /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &list); if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1206,7 +1212,7 @@ static void cpuup_canceled(long cpu) shared = n->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = NULL; } @@ -1221,6 +1227,7 @@ static void cpuup_canceled(long cpu) free_alien_cache(alien); } free_array_cache: + slabs_destroy(cachep, &list); kfree(nc); } /* @@ -2056,6 +2063,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2459,13 +2476,15 @@ static void do_drain(void *arg) struct array_cache *ac; int node = numa_mem_id(); struct kmem_cache_node *n; + LIST_HEAD(list); check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); spin_lock(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &list); spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail = 0; } @@ -3393,9 +3412,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) /* * Caller needs to acquire correct kmem_cache_node's list_lock + * 
@list: List of detached free slabs should be freed by caller */ -static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) { int i; struct kmem_cache_node *n = get_node(cachep, node); @@ -3418,13 +3438,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; - /* No need to drop any previously held - * lock here, even if we have a off-slab slab - * descriptor it is guaranteed to come from - * a different cache, refer to comments before - * alloc_slabmgmt. - */ - slab_destroy(cachep, page); + list_add_tail(&page->lru, list); } else { list_add(&page->lru, &n->slabs_free); } @@ -3443,6 +3457,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int batchcount; struct kmem_cache_node *n; int node = numa_mem_id(); + LIST_HEAD(list); batchcount = ac->batchcount; #if DEBUG @@ -3464,7 +3479,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, &list); free_done: #if STATS { @@ -3485,6 +3500,7 @@ free_done: } #endif spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3765,12 +3781,13 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3780,6 +3797,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + 
nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3869,6 +3887,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { + LIST_HEAD(list); struct array_cache *ccold = new->new[i]; int node; struct kmem_cache_node *n; @@ -3879,8 +3898,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(i); n = get_node(cachep, node); spin_lock_irq(&n->list_lock); - free_block(cachep, ccold->entry, ccold->avail, node); + free_block(cachep, ccold->entry, ccold->avail, node, &list); spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(ccold); } kfree(new); @@ -3988,6 +4008,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ -4000,12 +4021,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } -- cgit v1.2.2 From 1fe00d50a9e81150de5000490b87ed227525cf09 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:27 -0700 Subject: slab: factor out initialization of array cache Factor out initialization of array cache to use it in following patch. 
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f6ad8d335be7..8d9a0fff160d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -791,13 +791,8 @@ static void start_cpu_timer(int cpu) } } -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) +static void init_arraycache(struct array_cache *ac, int limit, int batch) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *nc = NULL; - - nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. * However, when such objects are allocated or transferred to another @@ -805,15 +800,25 @@ static struct array_cache *alloc_arraycache(int node, int entries, * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. */ - kmemleak_no_scan(nc); - if (nc) { - nc->avail = 0; - nc->limit = entries; - nc->batchcount = batchcount; - nc->touched = 0; - spin_lock_init(&nc->lock); + kmemleak_no_scan(ac); + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; + spin_lock_init(&ac->lock); } - return nc; +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + int memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + init_arraycache(ac, entries, batchcount); + return ac; } static inline bool is_slab_pfmemalloc(struct page *page) -- cgit v1.2.2 From c8522a3a5832b843570a3315674f5a3575958a51 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:29 -0700 Subject: slab: introduce alien_cache Currently, we use array_cache for alien_cache. 
Although they are mostly similar, there is one difference, that is, need for spinlock. We don't need spinlock for array_cache itself, but to use array_cache for alien_cache, array_cache structure should have spinlock. This is needless overhead, so removing it would be better. This patch prepare it by introducing alien_cache and using it. In the following patch, we remove spinlock in array_cache. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 108 ++++++++++++++++++++++++++++++++++++++------------------------ mm/slab.h | 2 +- 2 files changed, 68 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 8d9a0fff160d..de91d6f3a2a4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -203,6 +203,11 @@ struct array_cache { */ }; +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + #define SLAB_OBJ_PFMEMALLOC 1 static inline bool is_obj_pfmemalloc(void *objp) { @@ -491,7 +496,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, struct lock_class_key *l3_key, struct lock_class_key *alc_key, struct kmem_cache_node *n) { - struct array_cache **alc; + struct alien_cache **alc; int r; lockdep_set_class(&n->list_lock, l3_key); @@ -507,7 +512,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, return; for_each_node(r) { if (alc[r]) - lockdep_set_class(&alc[r]->lock, alc_key); + lockdep_set_class(&(alc[r]->ac.lock), alc_key); } } @@ -965,12 +970,13 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, n) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) { - return (struct array_cache **)BAD_ALIEN_MAGIC; + return (struct alien_cache **)BAD_ALIEN_MAGIC; } -static inline void 
free_alien_cache(struct array_cache **ac_ptr) +static inline void free_alien_cache(struct alien_cache **ac_ptr) { } @@ -996,40 +1002,52 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + int memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - struct array_cache **ac_ptr; + struct alien_cache **alc_ptr; int memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - ac_ptr = kzalloc_node(memsize, gfp, node); - if (ac_ptr) { - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); - if (!ac_ptr[i]) { - for (i--; i >= 0; i--) - kfree(ac_ptr[i]); - kfree(ac_ptr); - return NULL; - } + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; } } - return ac_ptr; + return alc_ptr; } -static void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct alien_cache **alc_ptr) { int i; - if (!ac_ptr) + if (!alc_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); - kfree(ac_ptr); + kfree(alc_ptr[i]); + kfree(alc_ptr); } static void __drain_alien_cache(struct kmem_cache *cachep, @@ -1063,25 +1081,31 @@ static void reap_alien(struct kmem_cache *cachep, struct 
kmem_cache_node *n) int node = __this_cpu_read(slab_reap_node); if (n->alien) { - struct array_cache *ac = n->alien[node]; - - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&ac->lock)) { + __drain_alien_cache(cachep, ac, node); + spin_unlock_irq(&ac->lock); + } } } } static void drain_alien_cache(struct kmem_cache *cachep, - struct array_cache **alien) + struct alien_cache **alien) { int i = 0; + struct alien_cache *alc; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = alien[i]; - if (ac) { + alc = alien[i]; + if (alc) { + ac = &alc->ac; spin_lock_irqsave(&ac->lock, flags); __drain_alien_cache(cachep, ac, i); spin_unlock_irqrestore(&ac->lock, flags); @@ -1093,7 +1117,8 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; - struct array_cache *alien = NULL; + struct alien_cache *alien = NULL; + struct array_cache *ac; int node; LIST_HEAD(list); @@ -1110,13 +1135,14 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; - spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { + ac = &alien->ac; + spin_lock(&ac->lock); + if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, ac, nodeid); } - ac_put_obj(cachep, alien, objp); - spin_unlock(&alien->lock); + ac_put_obj(cachep, ac, objp); + spin_unlock(&ac->lock); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); @@ -1191,7 +1217,7 @@ static void cpuup_canceled(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct 
array_cache *shared; - struct array_cache **alien; + struct alien_cache **alien; LIST_HEAD(list); /* cpu is dead; no one can alloc from it. */ @@ -1272,7 +1298,7 @@ static int cpuup_prepare(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; - struct array_cache **alien = NULL; + struct alien_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount, GFP_KERNEL); @@ -3762,7 +3788,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { diff --git a/mm/slab.h b/mm/slab.h index 3822b65edcc2..928823e17e58 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,7 +276,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif -- cgit v1.2.2 From 49dfc304ba241b315068023962004542c5118103 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:31 -0700 Subject: slab: use the lock on alien_cache, instead of the lock on array_cache Now, we have separate alien_cache structure, so it'd be better to hold the lock on alien_cache while manipulating alien_cache. After that, we don't need the lock on array_cache, so remove it. 
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index de91d6f3a2a4..e4ce73c32a7a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -191,7 +191,6 @@ struct array_cache { unsigned int limit; unsigned int batchcount; unsigned int touched; - spinlock_t lock; void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing @@ -512,7 +511,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, return; for_each_node(r) { if (alc[r]) - lockdep_set_class(&(alc[r]->ac.lock), alc_key); + lockdep_set_class(&(alc[r]->lock), alc_key); } } @@ -811,7 +810,6 @@ static void init_arraycache(struct array_cache *ac, int limit, int batch) ac->limit = limit; ac->batchcount = batch; ac->touched = 0; - spin_lock_init(&ac->lock); } } @@ -1010,6 +1008,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); return alc; } @@ -1086,9 +1085,9 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) if (alc) { ac = &alc->ac; - if (ac->avail && spin_trylock_irq(&ac->lock)) { + if (ac->avail && spin_trylock_irq(&alc->lock)) { __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + spin_unlock_irq(&alc->lock); } } } @@ -1106,9 +1105,9 @@ static void drain_alien_cache(struct kmem_cache *cachep, alc = alien[i]; if (alc) { ac = &alc->ac; - spin_lock_irqsave(&ac->lock, flags); + spin_lock_irqsave(&alc->lock, flags); __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + spin_unlock_irqrestore(&alc->lock, flags); } } } @@ -1136,13 +1135,13 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void 
*objp) if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; ac = &alien->ac; - spin_lock(&ac->lock); + spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, nodeid); } ac_put_obj(cachep, ac, objp); - spin_unlock(&ac->lock); + spin_unlock(&alien->lock); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); @@ -1613,10 +1612,6 @@ void __init kmem_cache_init(void) memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmem_cache->array[smp_processor_id()] = ptr; @@ -1626,10 +1621,6 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } -- cgit v1.2.2 From 833b706cc8b7b555e18d3426e9616bd066883a7a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:33 -0700 Subject: slab: destroy a slab without holding any alien cache lock I haven't heard that this alien cache lock is contended, but to reduce chance of contention would be better generally. And with this change, we can simplify complex lockdep annotation in slab code. In the following patch, it will be implemented. 
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e4ce73c32a7a..e4dc0896b891 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1050,10 +1050,10 @@ static void free_alien_cache(struct alien_cache **alc_ptr) } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + struct list_head *list) { struct kmem_cache_node *n = get_node(cachep, node); - LIST_HEAD(list); if (ac->avail) { spin_lock(&n->list_lock); @@ -1065,10 +1065,9 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node, &list); + free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); } } @@ -1086,8 +1085,11 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) if (alc) { ac = &alc->ac; if (ac->avail && spin_trylock_irq(&alc->lock)) { - __drain_alien_cache(cachep, ac, node); + LIST_HEAD(list); + + __drain_alien_cache(cachep, ac, node, &list); spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); } } } @@ -1104,10 +1106,13 @@ static void drain_alien_cache(struct kmem_cache *cachep, for_each_online_node(i) { alc = alien[i]; if (alc) { + LIST_HEAD(list); + ac = &alc->ac; spin_lock_irqsave(&alc->lock, flags); - __drain_alien_cache(cachep, ac, i); + __drain_alien_cache(cachep, ac, i, &list); spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); } } } @@ -1138,10 +1143,11 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - 
__drain_alien_cache(cachep, ac, nodeid); + __drain_alien_cache(cachep, ac, nodeid, &list); } ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); -- cgit v1.2.2 From 367f7f2f45e7f601bcf87aeffb0c81e6d26e53df Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:35 -0700 Subject: slab: remove a useless lockdep annotation Now, there is no code to hold two lock simultaneously, since we don't call slab_destroy() with holding any lock. So, lockdep annotation is useless now. Remove it. v2: don't remove BAD_ALIEN_MAGIC in this patch. It will be removed in the following patch. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 153 -------------------------------------------------------------- 1 file changed, 153 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e4dc0896b891..630c85469164 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -472,139 +472,6 @@ static struct kmem_cache kmem_cache_boot = { #define BAD_ALIEN_MAGIC 0x01020304ul -#ifdef CONFIG_LOCKDEP - -/* - * Slab sometimes uses the kmalloc slabs to store the slab headers - * for other slabs "off slab". - * The locking for this is tricky in that it nests within the locks - * of all other slabs in a few places; to deal with this special - * locking we put on-slab caches into a separate lock-class. - * - * We set lock class for alien array caches which are up during init. 
- * The lock annotation will be lost if all cpus of a node goes down and - * then comes back up during hotplug - */ -static struct lock_class_key on_slab_l3_key; -static struct lock_class_key on_slab_alc_key; - -static struct lock_class_key debugobj_l3_key; -static struct lock_class_key debugobj_alc_key; - -static void slab_set_lock_classes(struct kmem_cache *cachep, - struct lock_class_key *l3_key, struct lock_class_key *alc_key, - struct kmem_cache_node *n) -{ - struct alien_cache **alc; - int r; - - lockdep_set_class(&n->list_lock, l3_key); - alc = n->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&(alc[r]->lock), alc_key); - } -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, n); -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(cachep, node, n) - slab_set_debugobj_lock_classes_node(cachep, n); -} - -static void init_node_lock_keys(int q) -{ - int i; - - if (slab_state < UP) - return; - - for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache_node *n; - struct kmem_cache *cache = kmalloc_caches[i]; - - if (!cache) - continue; - - n = get_node(cache, q); - if (!n || OFF_SLAB(cache)) - continue; - - slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, n); - } -} - -static void on_slab_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, n); -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ - int node; - 
struct kmem_cache_node *n; - - VM_BUG_ON(OFF_SLAB(cachep)); - for_each_kmem_cache_node(cachep, node, n) - on_slab_lock_classes_node(cachep, n); -} - -static inline void __init init_lock_keys(void) -{ - int node; - - for_each_node(node) - init_node_lock_keys(node); -} -#else -static void __init init_node_lock_keys(int q) -{ -} - -static inline void init_lock_keys(void) -{ -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ -} - -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ -} -#endif - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -1348,13 +1215,7 @@ static int cpuup_prepare(long cpu) spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); - if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, n); - else if (!OFF_SLAB(cachep) && - !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, n); } - init_node_lock_keys(node); return 0; bad: @@ -1663,9 +1524,6 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* Done! */ slab_state = FULL; @@ -2446,17 +2304,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return err; } - if (flags & SLAB_DEBUG_OBJECTS) { - /* - * Would deadlock through slab_destroy()->call_rcu()-> - * debug_object_activate()->kmem_cache_alloc(). 
- */ - WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); - - slab_set_debugobj_lock_classes(cachep); - } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes(cachep); - return 0; } -- cgit v1.2.2 From a640616822b2c3a8009b0600f20c4a76ea8a0025 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:38 -0700 Subject: slab: remove BAD_ALIEN_MAGIC BAD_ALIEN_MAGIC value isn't used anymore. So remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 630c85469164..42a9eddb61cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -470,8 +470,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -838,7 +836,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) -- cgit v1.2.2 From 5e804789673114c616816f8387169790afe376b5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:40 -0700 Subject: slab: change int to size_t for representing allocation size It is better to represent allocation size in size_t rather than int. So change it. 
Signed-off-by: Joonsoo Kim Suggested-by: Andrew Morton Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 42a9eddb61cd..1351725f7936 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -681,7 +681,7 @@ static void init_arraycache(struct array_cache *ac, int limit, int batch) static struct array_cache *alloc_arraycache(int node, int entries, int batchcount, gfp_t gfp) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); @@ -868,7 +868,7 @@ static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct alien_cache *__alloc_alien_cache(int node, int entries, int batch, gfp_t gfp) { - int memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); struct alien_cache *alc = NULL; alc = kmalloc_node(memsize, gfp, node); @@ -880,7 +880,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct alien_cache **alc_ptr; - int memsize = sizeof(void *) * nr_node_ids; + size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) @@ -1037,7 +1037,7 @@ static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *n; - const int memsize = sizeof(struct kmem_cache_node); + const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* -- cgit v1.2.2 From 54266640709a24c9844245d0d9f36b9cb1f31326 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 6 Aug 2014 16:04:42 -0700 Subject: slub: avoid duplicate creation on the first object When a kmem_cache is 
created with ctor, each object in the kmem_cache will be initialized before ready to use. While in slub implementation, the first object will be initialized twice. This patch reduces the duplication of initialization of the first object. Fix commit 7656c72b ("SLUB: add macros for scanning objects in a slab"). Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 92d8139c556d..1f1f838326a0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -283,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -1379,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1402,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = 
page->objects; -- cgit v1.2.2 From 928cec9cd6db53a68f54bc9ef1c54c674ba1c6bb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:44 -0700 Subject: mm: move slab related stuff from util.c to slab_common.c Functions krealloc(), __krealloc(), kzfree() belongs to slab API, so should be placed in slab_common.c Also move slab allocator's tracepoints defenitions to slab_common.c No functional changes here. Signed-off-by: Andrey Ryabinin Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 102 ------------------------------------------------------- 2 files changed, 101 insertions(+), 102 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4bacc6a2..d319502b2403 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include "slab.h" @@ -787,3 +789,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. 
+ */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. 
*/ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/util.c b/mm/util.c index d5ea733c5082..7b6608df2ee8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -16,9 +16,6 @@ #include "internal.h" -#define CREATE_TRACE_POINTS -#include - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). 
If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -504,11 +410,3 @@ out_mm: out: return res; } - -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -- cgit v1.2.2 From 8a7d9b4306258e092afaae3c663661d22bf91f5c Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:04:46 -0700 Subject: mm/slab.c: fix comments Current struct kmem_cache has no 'lock' field, and slab page is managed by struct kmem_cache_node, which has 'list_lock' field. Clean up the related comment. 
Signed-off-by: Wang Sheng-Hui Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1351725f7936..2e60bf3dedbb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1611,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) } /* - * Interface to system's page allocator. No need to hold the cache-lock. + * Interface to system's page allocator. No need to hold the + * kmem_cache_node ->list_lock. * * If we requested dmaable memory, we will get it. Even if we * did not request dmaable memory, we might get it, but that @@ -1913,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, * @cachep: cache pointer being destroyed * @page: page pointer being destroyed * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * cache-lock is not held/needed. + * Destroy all the objs in a slab page, and release the mem back to the system. + * Before calling the slab page must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct page *page) { -- cgit v1.2.2 From 0aa9a13d80bae1bb24956f6e3e2662b7242e0b41 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Aug 2014 16:04:48 -0700 Subject: mm, slub: fix some indenting in cmpxchg_double_slab() The return statement goes with the cmpxchg_double() condition so it needs to be indented another tab. Also these days the fashion is to line function parameters up, and it looks nicer that way because then the "freelist_new" is not at the same indent level as the "return 1;". 
Signed-off-by: Dan Carpenter Signed-off-by: Pekka Enberg Signed-off-by: David Rientjes Cc: Joonsoo Kim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 1f1f838326a0..d9aadbfe7c29 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -381,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -417,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { -- cgit v1.2.2 From 4307c14f3c77bc0cb0facfc9c67c7872505aaedf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 6 Aug 2014 16:04:51 -0700 Subject: slab: fix the alias count (via sysfs) of slab cache We mark some slab caches (e.g. kmem_cache_node) as unmergeable by setting refcount to -1, and their alias should be 0, not refcount-1, so correct it here. 
Signed-off-by: Gu Zheng Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d9aadbfe7c29..9b861b90cde1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4466,7 +4466,7 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); -- cgit v1.2.2 From c42e5715617232563f0cf9f231d86b5133c4487e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:04:53 -0700 Subject: slab: convert last use of __FUNCTION__ to __func__ Just about all of these have been converted to __func__, so convert the last use. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 928823e17e58..0e0fdd365840 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -256,7 +256,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __FUNCTION__, cachep->name, s->name); + __func__, cachep->name, s->name); WARN_ON_ONCE(1); return s; } -- cgit v1.2.2 From 3e2faa085448d5c478ebc9d5f6cb4d822467f4d7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:55 -0700 Subject: mm/readahead.c: remove unused file_ra_state from count_history_pages count_history_pages does only call page_cache_prev_hole in rcu_lock context using address_space mapping. There's no need to have file_ra_state here. 
Signed-off-by: Fabian Frederick Acked-by: Fengguang Wu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7770b1..17b9172ec37f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; @@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: -- cgit v1.2.2 From f276540441d255e2f87b37411c4fb75b0eca1606 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:57 -0700 Subject: mm/memory_hotplug.c: add __meminit to grow_zone_span/grow_pgdat_span grow_zone_span and grow_pgdat_span are only called by __meminit __add_zone Signed-off-by: Fabian Frederick Cc: Toshi Kani Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 469bbf505f85..3557e8c9e8de 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_zone_end_pfn; @@ -427,8 +427,8 @@ out_fail: return -1; } -static void 
grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); -- cgit v1.2.2 From e19318116048d5fbdb8d230d6d37625834b503cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:59 -0700 Subject: mm/page_alloc.c: add __meminit to alloc_pages_exact_nid() alloc_pages_exact_nid() is only called by __meminit alloc_page_cgroup() Signed-off-by: Fabian Frederick Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad736ca1..fd4322cc096d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2962,7 +2962,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); -- cgit v1.2.2 From b95b4e1ed92a203f4bdfc55f53d6e9c2773e3b6d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:05:01 -0700 Subject: mm/page_alloc.c: unexport alloc_pages_exact_nid() It is only called by mm/page_cgroup.c which cannot be modular.
Reported-by: David Rientjes Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd4322cc096d..8c4f1b220ab9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2970,7 +2970,6 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() -- cgit v1.2.2 From 474750aba88817c53f39424e5567b8e4acc4b39b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:06 -0700 Subject: vmalloc: use rcu list iterator to reduce vmap_area_lock contention Richard Yao reported a month ago that his system have a trouble with vmap_area_lock contention during performance analysis by /proc/meminfo. Andrew asked why his analysis checks /proc/meminfo stressfully, but he didn't answer it. https://lkml.org/lkml/2014/4/10/416 Although I'm not sure that this is right usage or not, there is a solution reducing vmap_area_lock contention with no side-effect. That is just to use rcu list iterator in get_vmalloc_info(). rcu can be used in this function because all RCU protocol is already respected by writers, since Nick Piggin commit db64fe02258f1 ("mm: rewrite vmap layer") back in linux-2.6.28 Specifically : insertions use list_add_rcu(), deletions use list_del_rcu() and kfree_rcu(). Note the rb tree is not used from rcu reader (it would not be safe), only the vmap_area_list has full RCU protection. Note that __purge_vmap_area_lazy() already uses this rcu protection. 
rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { if (va->va_start < *start) *start = va->va_start; if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; } } rcu_read_unlock(); Peter: : While rcu list traversal over the vmap_area_list is safe, this may : arrive at different results than the spinlocked version. The rcu list : traversal version will not be a 'snapshot' of a single, valid instant : of the entire vmap_area_list, but rather a potential amalgam of : different list states. Joonsoo: : Yes, you are right, but I don't think that we should be strict here. : Meminfo is already not a 'snapshot' at specific time. While we try to get : certain stats, the other stats can change. And, although we may arrive at : different results than the spinlocked version, the difference would not be : large and would not make serious side-effect. 
[edumazet@google.com: add more commit description] Signed-off-by: Joonsoo Kim Reported-by: Richard Yao Acked-by: Eric Dumazet Cc: Peter Hurley Cc: Zhang Yanfei Cc: Johannes Weiner Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b67196..fdbb116ee669 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2690,14 +2690,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2724,7 +2724,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif -- cgit v1.2.2 From c0d73261f5c1355a35b8b40e871d31578ce0c044 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 6 Aug 2014 16:05:08 -0700 Subject: mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault() Use ACCESS_ONCE() in handle_pte_fault() when getting the entry or orig_pte upon which all subsequent decisions and pte_same() tests will be made. I have no evidence that its lack is responsible for the mm/filemap.c:202 BUG_ON(page_mapped(page)) in __delete_from_page_cache() found by trinity, and I am not optimistic that it will fix it. But I have found no other explanation, and ACCESS_ONCE() here will surely not hurt. If gcc does re-access the pte before passing it down, then that would be disastrous for correct page fault handling, and certainly could explain the page_mapped() BUGs seen (concurrent fault causing page to be mapped in a second time on top of itself: mapcount 2 for a single pte). 
Signed-off-by: Hugh Dickins Cc: Sasha Levin Cc: Linus Torvalds Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8b44f765b645..06ff0720d75a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3181,7 +3181,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { -- cgit v1.2.2 From 4f7c6b49c45a398d72763d1f0e64ddff8b3653c7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 6 Aug 2014 16:05:13 -0700 Subject: mem-hotplug: introduce MMOP_OFFLINE to replace the hard coding -1 In store_mem_state(), we have: ... 334 else if (!strncmp(buf, "offline", min_t(int, count, 7))) 335 online_type = -1; ... 355 case -1: 356 ret = device_offline(&mem->dev); 357 break; ... Here, "offline" is hard coded as -1. This patch does the following renaming: ONLINE_KEEP -> MMOP_ONLINE_KEEP ONLINE_KERNEL -> MMOP_ONLINE_KERNEL ONLINE_MOVABLE -> MMOP_ONLINE_MOVABLE and introduces MMOP_OFFLINE = -1 to avoid hard coding. 
Signed-off-by: Tang Chen Cc: Hu Tao Cc: Greg Kroah-Hartman Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3557e8c9e8de..a3797d3fd8a4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone = page_zone(pfn_to_page(pfn)); ret = -EINVAL; - if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) goto out; - if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) goto out; } - if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) goto out; } -- cgit v1.2.2 From 7be12fc9f8dcae7f4c9647d495bd64fc4ffb6836 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 6 Aug 2014 16:05:15 -0700 Subject: mm: page_alloc: simplify drain_zone_pages by using min() Instead of open-coding getting minimal value of two, just use min macro. That is why it is there for. While changing the function also change type of batch local variable to match type of per_cpu_pages::batch (which is int). 
Signed-off-by: Michal Nazarewicz Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c4f1b220ab9..c1c6cb78e5ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1257,15 +1257,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; -- cgit v1.2.2 From bc7f84c0e67c0ca90b6d0e95cc293ed5d8ad30c4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:05:17 -0700 Subject: mm/internal.h: use nth_page Use nth_page instead of pfn_to_page(page_to_pfn Signed-off-by: Fabian Frederick Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 7f22a11fcc66..a1b651b11c5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } -- cgit v1.2.2 From a254129e8686bff7a340b58f35241b04927e81c0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:25 -0700 Subject: CMA: generalize CMA reserved area management functionality Currently, there are two users on CMA functionality, one is the DMA 
subsystem and the other is the KVM on powerpc. They have their own code to manage the CMA reserved area even though they look really similar. My guess is that this is caused by differing needs for bitmap management. The KVM side wants to maintain a bitmap not at 1-page granularity but at a coarser one; it ends up using a bitmap where one bit represents 64 pages. When I implement CMA related patches, I have to change both places to apply my change, which is painful. I want to change this situation and reduce future code management overhead through this patch. This change could also help developers who want to use CMA in their new feature development, since they can use CMA easily without copying & pasting this reserved area management code. In previous patches, we have prepared some features to generalize CMA reserved area management and now it's time to do it. This patch moves core functions to mm/cma.c and changes the DMA APIs to use these functions. There is no functional change in DMA APIs. Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Zhang Yanfei Acked-by: Minchan Kim Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 11 ++ mm/Makefile | 1 + mm/cma.c | 333 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 345 insertions(+) create mode 100644 mm/cma.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3e9977a9d657..f4899ec39cf4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -508,6 +508,17 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 + help + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area.
This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". + config ZBUD tristate default n diff --git a/mm/Makefile b/mm/Makefile index 4064f3ec145e..8338473c329a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -62,3 +62,4 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 000000000000..656004216953 --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,333 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + * Aneesh Kumar K.V + * Joonsoo Kim + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. 
+ */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +static struct cma cma_areas[MAX_CMA_AREAS]; +static unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * 
alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. 
+ */ +int __init cma_declare_contiguous(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + struct cma **res_cma, bool fixed) +{ + struct cma *cma = &cma_areas[cma_area_count]; + int ret = 0; + + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* Reserve memory */ + if (base && fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. 
+ */ + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + (unsigned long)base); + return 0; + +err: + pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +{ + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + struct page *page = NULL; + int ret; + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + for (;;) { + mutex_lock(&cma->lock); + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { + mutex_unlock(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. 
+ */ + mutex_unlock(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } else if (ret != -EBUSY) { + cma_clear_bitmap(cma, pfn, count); + break; + } + cma_clear_bitmap(cma, pfn, count); + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, struct page *pages, int count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + + return true; +} -- cgit v1.2.2 From b7155e76a702d97553660828347b9f10858b4dd5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:30 -0700 Subject: mm, CMA: clean-up CMA allocation error path We can remove one call site of cma_clear_bitmap() if we call it first, before checking the error number.
Signed-off-by: Joonsoo Kim Acked-by: Minchan Kim Reviewed-by: Michal Nazarewicz Reviewed-by: Zhang Yanfei Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 656004216953..103a6663b7c7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -285,11 +285,12 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) if (ret == 0) { page = pfn_to_page(pfn); break; - } else if (ret != -EBUSY) { - cma_clear_bitmap(cma, pfn, count); - break; } + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); /* try again with a bit different memory target */ -- cgit v1.2.2 From c1f733aaaf30a0068a3126d5aa9d5b4c25ba4c0c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:32 -0700 Subject: mm, CMA: change cma_declare_contiguous() to obey coding convention Conventionally, we put output param to the end of param list and put the 'base' ahead of 'size', but cma_declare_contiguous() doesn't look like that, so change it. Additionally, move down cma_areas reference code to the position where it is really needed. 
Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Paolo Bonzini Cc: Zhang Yanfei Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 103a6663b7c7..488e50810ed1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -141,13 +141,13 @@ core_initcall(cma_init_reserved_areas); /** * cma_declare_contiguous() - reserve custom contiguous area - * @size: Size of the reserved area (in bytes), * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), * @limit: End address of the reserved memory (optional, 0 for any). * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. - * @res_cma: Pointer to store the created cma region. * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) @@ -157,12 +157,12 @@ core_initcall(cma_init_reserved_areas); * If @fixed is true, reserve contiguous area at exactly @base. If false, * reserve in range from @base to @limit. 
*/ -int __init cma_declare_contiguous(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - struct cma **res_cma, bool fixed) + bool fixed, struct cma **res_cma) { - struct cma *cma = &cma_areas[cma_area_count]; + struct cma *cma; int ret = 0; pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", @@ -218,6 +218,7 @@ int __init cma_declare_contiguous(phys_addr_t size, * Each reserved area must be initialised later, when more kernel * subsystems (like slab allocator) are available. */ + cma = &cma_areas[cma_area_count]; cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; -- cgit v1.2.2 From 0de9d2ebe590f9203dac59d4b8e298c473764b92 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:34 -0700 Subject: mm, CMA: clean-up log message We don't need explicit 'CMA:' prefix, since we already define prefix 'cma:' in pr_fmt. So remove it. 
Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Zhang Yanfei Cc: "Aneesh Kumar K.V" Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 488e50810ed1..c17751c0dcaf 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -225,12 +225,12 @@ int __init cma_declare_contiguous(phys_addr_t base, *res_cma = cma; cma_area_count++; - pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, (unsigned long)base); return 0; err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; } -- cgit v1.2.2 From f8303c2582b889351e261ff18c4d8eb197a77db2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 16:05:36 -0700 Subject: mm, thp: move invariant bug check out of loop in __split_huge_page_map In __split_huge_page_map(), the check for page_mapcount(page) is invariant within the for loop. Because the macro is implemented using atomic_read(), the compiler cannot optimize away the redundant check, leading to unnecessary reads of the page structure. This patch moves the invariant bug check out of the loop so that it will be done only once. On a 3.16-rc1 based kernel, a microbenchmark that broke up 1000 transparent huge pages using munmap() had an execution time of 38,245us and 38,548us with and without the patch respectively. The performance gain is about 1%. Signed-off-by: Waiman Long Acked-by: Kirill A.
Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Rik van Riel Cc: Scott J Norton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33514d88fef9..2161490526f0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); if (pmd_numa(*pmd)) -- cgit v1.2.2 From 3a79d52aa3c63c939f5a1f86e80e634f84e987c4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 16:05:38 -0700 Subject: mm, thp: replace smp_mb after atomic_add by smp_mb__after_atomic In some architectures like x86, atomic_add() is a full memory barrier. In that case, an additional smp_mb() is just a waste of time. This patch replaces that smp_mb() by smp_mb__after_atomic() which will avoid the redundant memory barrier in some architectures. With a 3.16-rc1 based kernel, this patch reduced the execution time of breaking 1000 transparent huge pages from 38,245us to 30,964us. A reduction of 19% which is quite sizeable. It also reduces the %cpu time of the __split_huge_page_refcount function in the perf profile from 2.18% to 1.15%. Signed-off-by: Waiman Long Acked-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Rik van Riel Cc: Scott J Norton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2161490526f0..4b95ff4120f5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: -- cgit v1.2.2 From 6539cc053869bd32a2db731b215b7c73b11f68d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:42 -0700 Subject: mm: memcontrol: fold mem_cgroup_do_charge() These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). 
Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. text data bss dec hex filename 37970 9892 400 48262 bc86 mm/memcontrol.o.old 35239 9892 400 45531 b1db mm/memcontrol.o This patch (of 13): This function was split out because mem_cgroup_try_charge() got too big. But having essentially one sequence of operations arbitrarily split in half is not good for reworking the code. Fold it back in. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 166 ++++++++++++++++++++++---------------------------------- 1 file changed, 64 insertions(+), 102 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f009a14918d2..fe3ad310656d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2551,80 +2551,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) -{ - unsigned long csize = nr_pages * PAGE_SIZE; - struct mem_cgroup *mem_over_limit; - struct res_counter *fail_res; - unsigned long flags = 0; - int ret; - - ret = res_counter_charge(&memcg->res, csize, &fail_res); - - if (likely(!ret)) { - if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - } else - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. 
- */ - if (nr_pages > min_pages) - return CHARGE_RETRY; - - if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; - - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; - - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; - /* - * Even though the limit is exceeded at this point, reclaim - * may have been able to free some pages. Retry the charge - * before killing the task. - * - * Only for regular pages, though: huge pages are rather - * unlikely to succeed so close to the limit, and we fall back - * to regular pages anyway in case of failure. - */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - - /* - * At task move, charge accounts can be doubly counted. So, it's - * better to wait until the end of task_move if something is going on. - */ - if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - /** * mem_cgroup_try_charge - try charging a memcg * @memcg: memcg to charge @@ -2641,7 +2567,11 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; + struct mem_cgroup *mem_over_limit; + struct res_counter *fail_res; + unsigned long nr_reclaimed; + unsigned long flags = 0; + unsigned long long size; if (mem_cgroup_is_root(memcg)) goto done; @@ -2661,44 +2591,76 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, if (gfp_mask & __GFP_NOFAIL) oom = false; -again: +retry: if (consume_stock(memcg, nr_pages)) goto done; - do { - bool invoke_oom = oom && !nr_oom_retries; + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { + if (!do_swap_account) + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + 
res_counter_uncharge(&memcg->res, size); + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + flags |= MEM_CGROUP_RECLAIM_NOSWAP; + } else + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (!(gfp_mask & __GFP_WAIT)) + goto nomem; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + if (gfp_mask & __GFP_NORETRY) + goto nomem; + + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); + + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; + /* + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. + */ + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. 
+ */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + goto retry; + + if (fatal_signal_pending(current)) + goto bypass; + + if (!oom) + goto nomem; + + if (nr_oom_retries--) + goto retry; + + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: return -EINTR; + +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return 0; } /** -- cgit v1.2.2 From 06b078fc065fe1fe7097675c8ee416aa2ef94fb3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:44 -0700 Subject: mm: memcontrol: rearrange charging fast path The charging path currently starts out with OOM condition checks when OOM is the rarest possible case. Rearrange this code to run OOM/task dying checks only after trying the percpu charge and the res_counter charge and bail out before entering reclaim. Attempting a charge does not hurt an (oom-)killed task as much as every charge attempt having to check OOM conditions. Also, only check __GFP_NOFAIL when the charge would actually fail. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe3ad310656d..f7b6bec9f538 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2575,22 +2575,6 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, if (mem_cgroup_is_root(memcg)) goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. 
- */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; - - if (unlikely(task_in_memcg_oom(current))) - goto nomem; - - if (gfp_mask & __GFP_NOFAIL) - oom = false; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2612,6 +2596,20 @@ retry: goto retry; } + /* + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; + if (!(gfp_mask & __GFP_WAIT)) goto nomem; @@ -2640,6 +2638,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (gfp_mask & __GFP_NOFAIL) + goto bypass; + if (fatal_signal_pending(current)) goto bypass; -- cgit v1.2.2 From 28c34c291e746aab1c2bfd6d6609b2e47fa0978b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:47 -0700 Subject: mm: memcontrol: reclaim at least once for __GFP_NORETRY Currently, __GFP_NORETRY tries charging once and gives up before even trying to reclaim. Bring the behavior on par with the page allocator and reclaim at least once before giving up. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7b6bec9f538..a73f3947f5d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2613,13 +2613,13 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; - if (gfp_mask & __GFP_NORETRY) - goto nomem; - nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= batch) goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge -- cgit v1.2.2 From d51d885bbb137cc8e1704e76be1846c5e0d5e8b4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:49 -0700 Subject: mm: huge_memory: use GFP_TRANSHUGE when charging huge pages Transparent huge page charges prefer falling back to regular pages rather than spending a lot of time in direct reclaim. Desired reclaim behavior is usually declared in the gfp mask, but THP charges use GFP_KERNEL and then rely on the fact that OOM is disabled for THP charges, and that OOM-disabled charges don't retry reclaim. Needless to say, this is anything but obvious and quite error prone. Convert THP charges to use GFP_TRANSHUGE instead, which implies __GFP_NORETRY, to indicate the low-latency requirement. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b95ff4120f5..24e354c2b59e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1132,7 +1132,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2399,7 +2399,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) return; /* -- cgit v1.2.2 From 9b1306192d335759a6cf2f3b404c49e811e5f953 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:51 -0700 Subject: mm: memcontrol: retry reclaim for oom-disabled and __GFP_NOFAIL charges There is no reason why oom-disabled and __GFP_NOFAIL charges should try to reclaim only once when every other charge tries several times before giving up. Make them all retry the same number of times. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a73f3947f5d9..3069d6420b0e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2566,7 +2566,7 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long nr_reclaimed; @@ -2638,6 +2638,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (nr_retries--) + goto retry; + if (gfp_mask & __GFP_NOFAIL) goto bypass; @@ -2647,9 +2650,6 @@ retry: if (!oom) goto nomem; - if (nr_oom_retries--) - goto retry; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) -- cgit v1.2.2 From 0029e19ebf84dcd70b226820daa7747b28d5956d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Aug 2014 16:05:53 -0700 Subject: mm: memcontrol: remove explicit OOM parameter in charge path For the page allocator, __GFP_NORETRY implies that no OOM should be triggered, whereas memcg has an explicit parameter to disable OOM. The only callsites that want OOM disabled are THP charges and charge moving. THP already uses __GFP_NORETRY and charge moving can use it as well - one full reclaim cycle should be plenty. Switch it over, then remove the OOM parameter. 
Signed-off-by: Johannes Weiner Signed-off-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3069d6420b0e..8aaca8267dfe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2555,15 +2555,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, * mem_cgroup_try_charge - try charging a memcg * @memcg: memcg to charge * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails * * Returns 0 if @memcg was charged successfully, -EINTR if the charge * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -2647,9 +2645,6 @@ retry: if (fatal_signal_pending(current)) goto bypass; - if (!oom) - goto nomem; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2675,15 +2670,14 @@ done: */ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -2900,8 +2894,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* * 
mem_cgroup_try_charge() chosed to bypass to root due to @@ -3650,7 +3643,6 @@ int mem_cgroup_charge_anon(struct page *page, { unsigned int nr_pages = 1; struct mem_cgroup *memcg; - bool oom = true; if (mem_cgroup_disabled()) return 0; @@ -3662,14 +3654,9 @@ int mem_cgroup_charge_anon(struct page *page, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, @@ -3706,7 +3693,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); + ret = mem_cgroup_try_charge(memcg, mask, 1); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3733,7 +3720,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, if (!PageSwapCache(page)) { struct mem_cgroup *memcg; - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; *memcgp = memcg; @@ -3802,7 +3789,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); @@ -6440,7 +6427,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = mem_cgroup_try_charge(memcg, + GFP_KERNEL & ~__GFP_NORETRY, 1); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; -- cgit 
v1.2.2 From 9476db974d9e18885123fcebc09f4596bb922e5f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:55 -0700 Subject: mm: memcontrol: simplify move precharge function The move precharge function does some baroque things: it tries raw res_counter charging of the entire amount first, and then falls back to a loop of one-by-one charges, with checks for pending signals and cond_resched() batching. Just use mem_cgroup_try_charge() without __GFP_WAIT for the first bulk charge attempt. In the one-by-one loop, remove the signal check (this is already checked in try_charge), and simply call cond_resched() after every charge - it's not that expensive. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aaca8267dfe..8a4159efa3c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6385,56 +6385,38 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; - if (mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(mc.to)) { mc.precharge += count; /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). 
- */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } + + /* Try a single bulk charge without reclaim first */ + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. + */ if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** -- cgit v1.2.2 From 692e7c45d95ad1064b6911800e2cfec7fc0236db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:57 -0700 Subject: mm: memcontrol: catch root bypass in move precharge When mem_cgroup_try_charge() returns -EINTR, it bypassed the charge to the root memcg. But move precharging does not catch this and treats this case as if no charge had happened, thus leaking a charge against root. Because of an old optimization, the root memcg's res_counter is not actually charged right now, but it's still an imbalance and subsequent patches will charge the root memcg again. Catch those bypasses to the root memcg and properly cancel them before giving up the move. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8a4159efa3c0..e0ac636315f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6401,6 +6401,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } + if (ret == -EINTR) { + __mem_cgroup_cancel_charge(root_mem_cgroup, count); + return ret; + } /* Try charges one by one with reclaim */ while (count--) { @@ -6409,8 +6413,11 @@ static int mem_cgroup_do_precharge(unsigned long count) /* * In case of failure, any residual charges against * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. */ + if (ret == -EINTR) + __mem_cgroup_cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; -- cgit v1.2.2 From 05b8430123359886ef6a4146fba384e30d771b3f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:59 -0700 Subject: mm: memcontrol: use root_mem_cgroup res_counter Due to an old optimization to keep expensive res_counter changes at a minimum, the root_mem_cgroup res_counter is never charged; there is no limit at that level anyway, and any statistics can be generated on demand by summing up the counters of all other cgroups. However, with per-cpu charge caches, res_counter operations do not even show up in profiles anymore, so this optimization is no longer necessary. Remove it to simplify the code. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 152 ++++++++++++++++---------------------------------------- 1 file changed, 44 insertions(+), 108 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0ac636315f8..07908ea954b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2570,9 +2570,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, unsigned long nr_reclaimed; unsigned long flags = 0; unsigned long long size; + int ret = 0; - if (mem_cgroup_is_root(memcg)) - goto done; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2650,13 +2649,15 @@ nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry; done_restock: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: - return 0; + return ret; } /** @@ -2695,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2713,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -3943,7 +3939,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, * replacement page, so leave it alone when 
phasing out the * page that is unused after the migration. */ - if (!end_migration && !mem_cgroup_is_root(memcg)) + if (!end_migration) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -4076,8 +4072,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. This memcg can * be obsolete one. We avoid calling css_tryget_online(). */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -4767,78 +4762,24 @@ out: return retval; } - -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) -{ - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; -} - -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
- */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; -} - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5300,7 +5241,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* * current_threshold points to threshold just below or equal to usage. 
@@ -5396,15 +5340,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5484,18 +5428,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6249,9 +6194,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -6387,13 +6332,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) /* Handlers for move charge at task migration. 
*/ static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - - if (mem_cgroup_is_root(mc.to)) { - mc.precharge += count; - /* we don't need css_get for root */ - return ret; - } + int ret; /* Try a single bulk charge without reclaim first */ ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); @@ -6700,21 +6639,18 @@ static void __mem_cgroup_clear_mc(void) /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } -- cgit v1.2.2 From 9a2385eef9f28fb5260c48c45fc8fe01f1da70a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:01 -0700 Subject: mm: memcontrol: remove ordering between pc->mem_cgroup and PageCgroupUsed There is a write barrier between setting pc->mem_cgroup and PageCgroupUsed, which was added to allow LRU operations to lookup the memcg LRU list of a page without acquiring the page_cgroup lock. But ever since commit 38c5d72f3ebe ("memcg: simplify LRU handling by new rule"), pages are ensured to be off-LRU while charging, so nobody else is changing LRU state while pc->mem_cgroup is being written, and there are no read barriers anymore. Remove the unnecessary write barrier. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 07908ea954b6..c31bc40a5827 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2795,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } pc->mem_cgroup = memcg; - /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. - */ - smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { @@ -3483,7 +3475,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], -- cgit v1.2.2 From a840cda63e543d41270698525542a82b7a8a18d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:04 -0700 Subject: mm: memcontrol: do not acquire page_cgroup lock for kmem pages Kmem page charging and uncharging is serialized by means of exclusive access to the page. Do not take the page_cgroup lock and don't set pc->flags atomically. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: Hugh Dickins Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c31bc40a5827..a6a062e409eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3407,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3422,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it -- cgit v1.2.2 From 8d07429319b2836604061f48f7e3dfe78acc060c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:10 -0700 Subject: mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable shrink_zones() has a special branch to skip the all_unreclaimable() check during hibernation, because a frozen kswapd can't mark a zone unreclaimable. 
But ever since commit 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages() livelock"), determining a zone to be unreclaimable is done by directly looking at its scan history and no longer relies on kswapd setting the per-zone flag. Remove this branch and let shrink_zones() check the reclaimability of the target zones regardless of hibernation state. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe8eb67..19b5b8016209 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2534,14 +2534,6 @@ out: if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ if (aborted_reclaim) return 1; -- cgit v1.2.2 From 0b06496a338e83627dc5f0d25323e7a1ae9cb87d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:12 -0700 Subject: mm: vmscan: rework compaction-ready signaling in direct reclaim Page reclaim for a higher-order page runs until compaction is ready, then aborts and signals this situation through the return value of shrink_zones(). This is an oddly specific signal to encode in the return value of shrink_zones(), though, and can be quite confusing. Introduce sc->compaction_ready and signal the compactability of the zones out-of-band to free up the return value of shrink_zones() for actual zone reclaimability. 
Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 70 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 19b5b8016209..6f43df4a5253 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,6 +65,9 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* One of the zones is ready for compaction */ + int compaction_ready; + /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. 
Continue reclaiming until @@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) */ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return aborted_reclaim; } /* All zones in zonelist are unreclaimable? 
*/ @@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; delayacct_freepages_start(); @@ -2500,11 +2492,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2526,16 +2521,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; /* top priority shrink_zones still had more to do? don't OOM, then */ -- cgit v1.2.2 From 2344d7e44b870f9df67e505ee4e633217de752ba Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:15 -0700 Subject: mm: vmscan: remove all_unreclaimable() Direct reclaim currently calls shrink_zones() to reclaim all members of a zonelist, and if that wasn't successful it does another pass through the same zonelist to check overall reclaimability. Just check reclaimability in shrink_zones() directly and propagate the result through the return value. Then remove all_unreclaimable(). 
Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f43df4a5253..74a9e0ae09b0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2290,8 +2291,13 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ @@ -2340,8 +2346,10 @@ static inline bool compaction_ready(struct zone *zone, int order) * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * Returns true if a zone was reclaimable. 
*/ -static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2354,6 +2362,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2414,10 +2423,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2439,26 +2455,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; -} - -/* All zones in zonelist are unreclaimable? 
*/ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2482,6 +2480,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2492,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2532,8 +2531,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; -- cgit v1.2.2 From 02695175c79b9163c798cc1cb78c628d011c07a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:17 -0700 Subject: mm: vmscan: move swappiness out of scan_control Swappiness is determined for each scanned memcg individually in shrink_zone() and is not a parameter that applies throughout the reclaim scan. Move it out of struct scan_control to prevent accidental use of a stale value. 
Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 74a9e0ae09b0..c28b8981e56a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -89,9 +89,6 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; - /* anon vs. file LRUs scanning "ratio" */ - int swappiness; - /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -1868,8 +1865,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1912,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !sc->swappiness) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1922,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && sc->swappiness) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1965,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. 
* This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -2055,7 +2052,8 @@ out: /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2066,7 +2064,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2263,11 +2261,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - sc->swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2714,10 +2713,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_swap = !noswap, .order = 0, .priority = 0, - .swappiness = mem_cgroup_swappiness(memcg), .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2733,7 +2732,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit v1.2.2 From ee814fe23daf08abd3ea6c6b1f900f4f25b524d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:19 -0700 Subject: mm: vmscan: clean up struct scan_control Reorder the members by input and output, then turn the individual integers for may_writepage, may_unmap, may_swap, compaction_ready, hibernation_mode into bit fields to save stack space: +72/-296 -224 kswapd 104 176 +72 try_to_free_pages 80 56 -24 try_to_free_mem_cgroup_pages 80 56 -24 shrink_all_memory 88 64 -24 reclaim_clean_pages_from_list 168 144 -24 mem_cgroup_shrink_node_zone 104 80 -24 __zone_reclaim 176 152 -24 balance_pgdat 152 - -152 Signed-off-by: Johannes Weiner Suggested-by: Mel Gorman Acked-by: Mel Gorman Acked-by: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 99 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 46 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c28b8981e56a..81dd858b9d17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,35 +59,20 @@ #include struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - /* One of the zones is ready for compaction */ - int compaction_ready; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? 
*/ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -95,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -2668,15 +2669,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, }; /* @@ -2706,14 +2706,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 
0, - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); int swappiness = mem_cgroup_swappiness(memcg); @@ -2748,16 +2745,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, }; /* @@ -3015,12 +3010,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3401,14 +3395,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3588,13 +3581,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & 
RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, -- cgit v1.2.2 From 54980b93c026ac24b7d5046597a254244eafcdeb Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:06:23 -0700 Subject: mm: update the description for madvise_remove Currently, we have more filesystems supporting fallocate, e.g. ext4/btrfs. Remove the outdated comment for madvise_remove. Signed-off-by: Wang Sheng-Hui Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index a402f8fdc68e..0938b30da4ab 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. - * - * NOTE: Currently, only shmfs/tmpfs is supported for this operation. - * Other filesystems return -ENOSYS. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, -- cgit v1.2.2 From 660654f90e7f8f6d8163276d47fc1573a39c7007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Aug 2014 16:06:25 -0700 Subject: mm/vmalloc.c: add a schedule point to vmalloc() It is not uncommon on busy servers to get stuck hundreds of ms in vmalloc() calls (like file descriptor expansions). Add a cond_resched() to __vmalloc_area_node() to be gentle to other tasks. 
[akpm@linux-foundation.org: only do it for __GFP_WAIT, per David] Signed-off-by: Eric Dumazet Cc: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fdbb116ee669..a3cad905f560 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1602,6 +1602,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } if (map_vm_area(area, prot, &pages)) -- cgit v1.2.2 From 930f036b4ff6501b91e09bba4bf94423203dabd9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:28 -0700 Subject: mm, vmalloc: constify allocation mask tmp_mask in the __vmalloc_area_node() iteration never changes so it can be moved into function scope and marked with const. This causes the movl and orl to only be done once per call rather than area->nr_pages times. nested_gfp can also be marked const. 
Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3cad905f560..9ec4173f48a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1566,7 +1566,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1589,12 +1590,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ -- cgit v1.2.2 From 66ee4b8887ec5ce04bae3e840d206db7b7ad34d1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:32 -0700 Subject: shmem: fix double uncharge in __shmem_file_setup() If __shmem_file_setup() fails on struct file allocation it uncharges memory commitment twice: first by shmem_unacct_size() and second time implicitly in shmem_evict_inode() when it kills the newly created inode. This patch removes shmem_unacct_size() from error path if the inode was already there. 
Signed-off-by: Konstantin Khlebnikov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index af68b15a8fc1..3609d31ad0dd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2932,16 +2932,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory; inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@ -2949,19 +2949,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path; res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path; return res; -put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); +put_path: + path_put(&path); return res; } -- cgit v1.2.2 From 77142517990fd3d982678c2945ea2c4188ec5f9a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:34 -0700 Subject: shmem: update memory reservation on truncate A shared anonymous mapping created without MAP_NORESERVE holds memory reservation for whole range of shmem segment. Usually there is no way to change its size, but /proc/<pid>/map_files/... (available if CONFIG_CHECKPOINT_RESTORE=y) allows that. This patch adjusts the memory reservation in shmem_setattr(). 
Signed-off-by: Konstantin Khlebnikov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 3609d31ad0dd..57fd82a5af7a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t newsize = attr->ia_size; if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } -- cgit v1.2.2 From 82f71ae4a2b829a25971bdf54b4d0d3d69d3c8b7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:36 -0700 Subject: mm: catch memory commitment underflow Print a warning (if CONFIG_DEBUG_VM=y) when memory commitment becomes too negative. This shouldn't happen any more - the previous two patches fixed the committed_as underflow issues. 
[akpm@linux-foundation.org: use VM_WARN_ONCE, per Dave] Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 129b847d30cc..64c9d736155c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* -- cgit v1.2.2 From cc7452b6dca384400960d40090a98d0eb920ab22 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Wed, 6 Aug 2014 16:06:38 -0700 Subject: mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces Historically, we exported shared pages to userspace via sysinfo(2) sharedram and /proc/meminfo's "MemShared" fields. With the advent of tmpfs, from kernel v2.4 onward, that old way for accounting shared mem was deemed inaccurate and we started to export a hard-coded 0 for sysinfo.sharedram. Later on, during the 2.6 timeframe, "MemShared" got re-introduced to /proc/meminfo re-branded as "Shmem", but we're still reporting sysinfo.sharedmem as that old hard-coded zero, which makes the "shared memory" report inconsistent across interfaces. This patch leverages the addition of explicit accounting for pages used by shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat" -- in order to make the users of sysinfo(2) and si_meminfo*() friends aware of that vmstat entry and make them report it consistently across the interfaces, as well to make sysinfo(2) returned data consistent with our current API documentation states. 
Signed-off-by: Rafael Aquini Acked-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1c6cb78e5ca..0987ac9f0a4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3047,7 +3047,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3067,6 +3067,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.2 From c2ea2181db43ced2e5945b9596bb3bb9935ce92e Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:06:41 -0700 Subject: mm/hwpoison-inject.c: remove unnecessary null test before debugfs_remove_recursive Fix checkpatch warning: "WARNING: debugfs_remove_recursive(NULL) is safe this check is probably not required" Signed-off-by: Fabian Frederick Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c71cad5..329caf56df22 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if 
(hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) -- cgit v1.2.2 From eb39d618f9e80f81cfc5788cf1b252d141c2f0c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 6 Aug 2014 16:06:43 -0700 Subject: mm: replace init_page_accessed by __SetPageReferenced Do we really need an exported alias for __SetPageReferenced()? Its callers better know what they're doing, in which case the page would not be already marked referenced. Kill init_page_accessed(), just __SetPageReferenced() inline. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dave Hansen Cc: Prabhakar Lad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/shmem.c | 2 +- mm/swap.c | 14 +++----------- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 65d44fd88c78..7e85c8147e1b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1091,9 +1091,9 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { diff --git a/mm/shmem.c b/mm/shmem.c index 57fd82a5af7a..fe15d96c3166 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,7 +1166,7 @@ repeat: __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page); error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); diff --git a/mm/swap.c b/mm/swap.c index 9e8e3472248b..d8eb4d09ffa2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> 
inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Used to mark_page_accessed(page) that is not visible yet and when it is - * still safe to use non-atomic ops - */ -void init_page_accessed(struct page *page) -{ - if (!PageReferenced(page)) - __SetPageReferenced(page); -} -EXPORT_SYMBOL(init_page_accessed); - static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); -- cgit v1.2.2 From 2f4612af43d4854c892f5ef8ed7a98b6492aee44 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Aug 2014 16:06:45 -0700 Subject: mm,hugetlb: make unmap_ref_private() return void This function always returns 1, thus no need to check return value in hugetlb_cow(). By doing so, we can get rid of the unnecessary WARN_ON call. While this logic perhaps existed as a way of identifying future unmap_ref_private() mishandling, reality is it serves no apparent purpose. Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0a73d2fcff..b94752ae791b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2754,8 +2754,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. 
*/ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2794,8 +2794,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2857,20 +2855,18 @@ retry_avoidcopy: */ if (outside_reserve) { BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } /* Caller expects lock to be held */ -- cgit v1.2.2 From ad4404a226ea92f2966f0e5378614e15ff4a7c76 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Aug 2014 16:06:47 -0700 Subject: mm,hugetlb: simplify error handling in hugetlb_cow() When returning from hugetlb_cow(), we always (1) put back the refcount for each referenced page -- always 'old', and 'new' if allocation was successful. And (2) retake the page table lock right before returning, as the callers expects. This logic can be simplified and encapsulated, as proposed in this patch. 
In addition to cleaner code, we also shave a few bytes off the instruction text: text data bss dec hex filename 28399 462 41328 70189 1122d mm/hugetlb.o-baseline 28367 462 41328 70157 1120d mm/hugetlb.o-patched Passes libhugetlbfs testcases. Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b94752ae791b..e84d22ce5de8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2808,7 +2808,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2838,14 +2838,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2854,6 +2854,7 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); @@ -2869,12 +2870,9 @@ retry_avoidcopy: return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? 
+ VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2882,11 +2880,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2896,6 +2891,7 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered @@ -2916,12 +2912,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ -- cgit v1.2.2 From f37d4298aa7f8b74395aa13c728677e2ed86fdaf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 Aug 2014 16:06:49 -0700 Subject: hwpoison: fix race with changing page during offlining When a hwpoison page is locked it could change state due to parallel modifications. The original compound page can be torn down and then this 4k page becomes part of a differently-sized compound page or is a standalone regular page. Check after the lock if the page is still the same compound page. We could go back, grab the new head page and try again but it should be quite rare, so I thought this was safest. A retry loop would be more difficult to test and may have more side effects. The hwpoison code by design only tries to handle cases that are reasonably common in workloads, as visible in page-flags.
I'm not really that concerned about handling this (likely rare case), just not crashing on it. Signed-off-by: Andi Kleen Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a013bc94ebbe..44c6bd201d3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1172,6 +1172,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); + /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } + /* * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. One -- cgit v1.2.2 From 238d3c13f0cce38752072dc90f4e828abdfec143 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:51 -0700 Subject: mm, hugetlb: generalize writes to nr_hugepages Three different interfaces alter the maximum number of hugepages for an hstate: - /proc/sys/vm/nr_hugepages for global number of hugepages of the default hstate, - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages for global number of hugepages for a specific hstate, and - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages/mempolicy for number of hugepages for a specific hstate over the set of allowed nodes. Generalize the code so that a single function handles all of these writes instead of duplicating the code in two different functions. This decreases the number of lines of code, but also reduces the size of .text by about half a percent since set_max_huge_pages() can be inlined. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. 
Shutemov" Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e84d22ce5de8..7a0fcb33973e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1734,21 +1734,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; @@ -1784,6 +1776,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1793,7 +1802,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1812,7 +1821,7 @@ static ssize_t 
nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -2248,36 +2257,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -ENOTSUPP; - tmp = h->max_huge_pages; - - if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) - return -EINVAL; - table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } -- cgit v1.2.2 From ed4d4902ebdd7ca8b5a51daaf6bebf4b172895cc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:54 -0700 Subject: mm, hugetlb: remove hugetlb_zero and hugetlb_infinity They are unnecessary: "zero" can be used in place of "hugetlb_zero" and passing extra2 == NULL is equivalent to infinity. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0fcb33973e..d9ad93b55585 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; -- cgit v1.2.2 From 21bda264f4243f61dfcc485174055f12ad0530b4 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:06:56 -0700 Subject: mm: make copy_pte_range static again Commit 71e3aac0724f ("thp: transparent hugepage core") adds copy_pte_range prototype to huge_mm.h. I'm not sure why (or if) this function has been used outside of memory.c, but it currently isn't. This patch makes copy_pte_range() static again. Signed-off-by: Jerome Marchand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 06ff0720d75a..01d0289f30a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -884,7 +884,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { -- cgit v1.2.2 From f6f8ed47353597dcb895eb4a15a28af657392e72 Mon Sep 17 00:00:00 2001 From: WANG Chao Date: Wed, 6 Aug 2014 16:06:58 -0700 Subject: mm/vmalloc.c: clean up map_vm_area third argument Currently map_vm_area() takes (struct page *** pages) as third argument, and after mapping, it moves (*pages) to point to (*pages + nr_mapped_pages). It looks like this kind of increment is useless to its caller these days.
The callers don't care about the increments and actually they're trying to avoid this by passing another copy to map_vm_area(). The caller can always guarantee all the pages can be mapped into vm_area as specified in first argument and the caller only cares about whether map_vm_area() fails or not. This patch cleans up the pointer movement in map_vm_area() and updates its callers accordingly. Signed-off-by: WANG Chao Cc: Zhang Yanfei Acked-by: Greg Kroah-Hartman Cc: Minchan Kim Cc: Nitin Gupta Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++--------- mm/zsmalloc.c | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9ec4173f48a8..2b0aa5486092 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) } EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 
+ 0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1606,7 +1602,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189624cf..bb62a4adc328 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -690,7 +690,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } -- cgit v1.2.2 From 9aed8614af5a05cdaa32a0b78b0f1a424754a958 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 6 Aug 2014 16:07:05 -0700 Subject: mm/memory.c: don't forget to set softdirty on file mapped fault Otherwise we may not notice that pte was softdirty because pte_mksoft_dirty helper _returns_ new pte but doesn't modify the argument. If a page fault happened on a dirty file mapping, the newly created pte may lose the softdirty bit; thus, if a userspace program is tracking memory changes with the help of a memory tracker (CONFIG_MEM_SOFT_DIRTY), it might miss modification of a memory page (which in the worst case may lead to data inconsistency).
Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 01d0289f30a7..7e131325bdf8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2744,7 +2744,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - pte_mksoft_dirty(entry); + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); -- cgit v1.2.2 From d0480be44a90af81a425f426c9107fb8f0899f65 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:07:07 -0700 Subject: mm: update the description for vm_total_pages vm_total_pages is calculated by nr_free_pagecache_pages(), which counts the number of pages which are beyond the high watermark within all zones. So vm_total_pages is not equal to total number of pages which the VM controls. Signed-off-by: Wang Sheng-Hui Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 81dd858b9d17..5fec1ba9951f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -137,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. 
+ */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -- cgit v1.2.2 From 24b7e5819ad5cbef2b7c7376510862aa8319d240 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:11 -0700 Subject: mm: pagemap: avoid unnecessary overhead when tracepoints are deactivated This was formerly the series "Improve sequential read throughput" which noted some major differences in performance of tiobench since 3.0. While there are a number of factors, two that dominated were the introduction of the fair zone allocation policy and changes to CFQ. The behaviour of fair zone allocation policy makes more sense than tiobench as a benchmark and CFQ defaults were not changed due to insufficient benchmarking. This series is what's left. It's one functional fix to the fair zone allocation policy when used on NUMA machines and a reduction of overhead in general. tiobench was used for the comparison despite its flaws as an IO benchmark as in this case we are primarily interested in the overhead of page allocator and page reclaim activity. On UMA, it makes little difference to overhead 3.16.0-rc3 3.16.0-rc3 vanilla lowercost-v5 User 383.61 386.77 System 403.83 401.74 Elapsed 5411.50 5413.11 On a 4-socket NUMA machine it's a bit more noticeable 3.16.0-rc3 3.16.0-rc3 vanilla lowercost-v5 User 746.94 802.00 System 65336.22 40852.33 Elapsed 27553.52 27368.46 This patch (of 6): The LRU insertion and activate tracepoints take PFN as a parameter forcing the overhead to the caller. Move the overhead to the tracepoint fast-assign method to ensure the cost is only incurred when the tracepoint is active.
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index d8eb4d09ffa2..c789d01c9ec3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -988,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* -- cgit v1.2.2 From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:14 -0700 Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines The arrangement of struct zone has changed over time and now it has reached the point where there is some inappropriate sharing going on. On x86-64 for example o The zone->node field is shared with the zone lock and zone->node is accessed frequently from the page allocator due to the fair zone allocation policy. o span_seqlock is almost never used but shares a line with free_area o Some zone statistics share a cache line with the LRU lock so reclaim-intensive and allocator-intensive workloads can bounce the cache line on a stat update This patch rearranges struct zone to put read-only and read-mostly fields together and then splits the page allocator intensive fields, the zone statistics and the page reclaim intensive fields into their own cache lines.
Note that the type of lowmem_reserve changes due to the watermark calculations being signed and avoiding a signed/unsigned conversion there. On the test configuration I used the overall size of struct zone shrunk by one cache line. On smaller machines, this is not likely to be noticeable. However, on a 4-node NUMA machine running tiobench the system CPU overhead is reduced by this patch. 3.16.0-rc3 3.16.0-rc3 vanilla rearrange-v5r9 User 746.94 759.78 System 65336.22 58350.98 Elapsed 27553.52 27282.02 Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- mm/vmstat.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0987ac9f0a4e..b7381d11f021 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1708,7 +1708,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1723,7 +1722,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -3254,7 +3253,7 @@ void show_free_areas(unsigned int filter) ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -5575,7 +5574,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j =
i; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49bfd55..8267f77d1875 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1077,10 +1077,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); -- cgit v1.2.2 From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:16 -0700 Subject: mm: move zone->pages_scanned into a vmstat counter zone->pages_scanned is a write-intensive cache line during page reclaim and it's also updated during page free. Move the counter into vmstat to take advantage of the per-cpu updates and do not update it in the free paths unless necessary. On a small UMA machine running tiobench the difference is marginal. On a 4-node machine the overhead is more noticable. Note that automatic NUMA balancing was disabled for this test as otherwise the system CPU overhead is unpredictable. 3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanillarearrange-v5 vmstat-v5 User 746.94 759.78 774.56 System 65336.22 58350.98 32847.27 Elapsed 27553.52 27282.02 27415.04 Note that the overhead reduction will vary depending on where exactly pages are allocated and freed. 
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++--- mm/vmscan.c | 7 ++++--- mm/vmstat.c | 3 ++- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7381d11f021..daa016063793 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) @@ -3248,7 +3254,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? 
"yes" : "no") ); printk("lowmem_reserve[]:"); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fec1ba9951f..9c8222b499b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -174,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -1508,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1698,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8267f77d1875..e574e883fa70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -763,6 +763,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); -- cgit v1.2.2 From bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:18 -0700 Subject: mm: vmscan: only update per-cpu thresholds for online CPU When 
kswapd is awake reclaiming, the per-cpu stat thresholds are lowered to get more accurate counts to avoid breaching watermarks. This threshold update iterates over all possible CPUs which is unnecessary. Only online CPUs need to be updated. If a new CPU is onlined, refresh_zone_stat_thresholds() will set the thresholds correctly. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index e574e883fa70..e9ab104b956f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } -- cgit v1.2.2 From f7b5d647946aae1647bf5cd26c16b3a793c1ac49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:20 -0700 Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered The purpose of numa_zonelist_order=zone is to preserve lower zones for use with 32-bit devices. If locality is preferred then the numa_zonelist_order=node policy should be used. Unfortunately, the fair zone allocation policy overrides this by skipping zones on remote nodes until the lower one is found. While this makes sense from a page aging and performance perspective, it breaks the expected zonelist policy. This patch restores the expected behaviour for zone-list ordering. 
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index daa016063793..6e5e8f762532 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1965,7 +1965,7 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) - continue; + break; if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; } -- cgit v1.2.2 From 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:22 -0700 Subject: mm: page_alloc: reduce cost of the fair zone allocation policy The fair zone allocation policy round-robins allocations between zones within a node to avoid age inversion problems during reclaim. If the first allocation fails, the batch counts are reset and a second attempt made before entering the slow path. One assumption made with this scheme is that batches expire at roughly the same time and the resets each time are justified. This assumption does not hold when zones reach their low watermark as the batches will be consumed at uneven rates. Allocation failure due to watermark depletion result in additional zonelist scans for the reset and another watermark check before hitting the slowpath. On UMA, the benefit is negligible -- around 0.25%. On 4-socket NUMA machine it's variable due to the variability of measuring overhead with the vmstat changes. The system CPU overhead comparison looks like 3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanilla vmstat-v5 lowercost-v5 User 746.94 774.56 802.00 System 65336.22 32847.27 40852.33 Elapsed 27553.52 27415.04 27368.46 However it is worth noting that the overall benchmark still completed faster and intuitively it makes sense to take as few passes as possible through the zonelists. 
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 101 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e5e8f762532..fb9908148474 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1612,6 +1612,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
@@ -1966,8 +1985,10 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; + } } /* * When allocating a page cache page for writing, we @@ -2073,13 +2094,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2088,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. 
+ */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2767,28 +2789,11 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { - /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. 
Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not -- cgit v1.2.2 From 9a95f3cf7b33d66fa64727cff8cd2f2a9d09f335 Mon Sep 17 00:00:00 2001 From: Paul Cassella Date: Wed, 6 Aug 2014 16:07:24 -0700 Subject: mm: describe mmap_sem rules for __lock_page_or_retry() and callers Add a comment describing the circumstances in which __lock_page_or_retry() will or will not release the mmap_sem when returning 0. Add comments to lock_page_or_retry()'s callers (filemap_fault(), do_swap_page()) noting the impact on VM_FAULT_RETRY returns. Add comments on up the call tree, particularly replacing the false "We return with mmap_sem still held" comments. Signed-off-by: Paul Cassella Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 23 +++++++++++++++++++++++ mm/gup.c | 18 +++++++++++++++--- mm/memory.c | 34 +++++++++++++++++++++++++++++++--- mm/mlock.c | 9 ++++++++- 4 files changed, 77 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 7e85c8147e1b..af19a6b079f5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. 
+ */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { diff --git a/mm/gup.c b/mm/gup.c index cc5a9e7adea7..91d044b1600d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -258,6 +258,11 @@ unmap: return ret; } +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { @@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_sem held. It may be released. See below. 
* * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages); * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This should be called with the mm_sem held for read. + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) diff --git a/mm/memory.c b/mm/memory.c index 7e131325bdf8..4d0a543f3bb3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). 
*/ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -2688,6 +2691,11 @@ oom: return VM_FAULT_OOM; } +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry(). + */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, struct page **page) { @@ -3016,6 +3024,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3040,7 +3054,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3172,7 +3188,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. 
+ * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3232,6 +3251,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3313,6 +3335,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { diff --git a/mm/mlock.c b/mm/mlock.c index b1eb53634005..ce84cb0b83ef 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. 
*/ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) -- cgit v1.2.2 From fed400a181447ba975d40e1df5e0d555eae51795 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:07:26 -0700 Subject: mm/shmem.c: remove the unused gfp arg to shmem_add_to_page_cache() The gfp arg is not used in shmem_add_to_page_cache. Remove this unused arg. Signed-off-by: Wang Sheng-Hui Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fe15d96c3166..302d1cf7ad07 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -293,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error; @@ -666,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1112,7 +1112,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1175,7 +1175,7 @@ repeat: error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { -- cgit v1.2.2 From 14a4e2141e24304fff2c697be6382ffb83888185 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:29 -0700 Subject: mm, thp: only collapse hugepages to nodes with affinity for 
zone_reclaim_mode Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target node") improved the previous khugepaged logic which allocated a transparent hugepage from the node of the first page being collapsed. However, it is still possible to collapse pages to remote memory which may suffer from additional access latency. With the current policy, it is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed remotely if the majority are allocated from that node. When zone_reclaim_mode is enabled, it means the VM should make every attempt to allocate locally to prevent NUMA performance degradation. In this case, we do not want to collapse hugepages to remote nodes that would suffer from increased access latency. Thus, when zone_reclaim_mode is enabled, only allow collapsing to nodes with RECLAIM_DISTANCE or less. There is no functional change for systems that disable zone_reclaim_mode. Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Andrea Arcangeli Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Rik van Riel Cc: "Kirill A. Shutemov" Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 24e354c2b59e..3630d577e987 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void) static int khugepaged_node_load[MAX_NUMNODES]; +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally.
+ */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA static int khugepaged_find_target_node(void) { @@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) -- cgit v1.2.2 From 9ef0a0ffa28edbf5c7cfa6be73b4ecb9896a3875 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:31 -0700 Subject: mm, writeback: prevent race when calculating dirty limits Setting vm_dirty_bytes and dirty_background_bytes is not protected by any serialization. Therefore, it's possible for either variable to change value after the test in global_dirty_limits() to determine whether available_memory needs to be initialized or not. Always ensure that available_memory is properly initialized. 
Signed-off-by: David Rientjes Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0c943014eb7..91d73ef1744d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { + const unsigned long available_memory = global_dirtyable_memory(); unsigned long background; unsigned long dirty; - unsigned long uninitialized_var(available_memory); struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); - if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else -- cgit v1.2.2 From aee52cae00ba0d426a827b761920a476a08eca9e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 6 Aug 2014 16:07:33 -0700 Subject: slub: remove kmemcg id from create_unique_id This function is never called for memcg caches, because they are unmergeable, so remove the dead code. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 9b861b90cde1..3e8afcc07a76 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5128,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", - memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } -- cgit v1.2.2 From 6326440077a48d2c3b2993f3b3f2d969f09b6917 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:36 -0700 Subject: memory-hotplug: add zone_for_memory() for selecting zone for new memory This series of patches fixes a problem when adding memory in bad manner. For example: for a x86_64 machine booted with "mem=400M" and with 2GiB memory installed, following commands cause problem: # echo 0x40000000 > /sys/devices/system/memory/probe [ 28.613895] init_memory_mapping: [mem 0x40000000-0x47ffffff] # echo 0x48000000 > /sys/devices/system/memory/probe [ 28.693675] init_memory_mapping: [mem 0x48000000-0x4fffffff] # echo online_movable > /sys/devices/system/memory/memory9/state # echo 0x50000000 > /sys/devices/system/memory/probe [ 29.084090] init_memory_mapping: [mem 0x50000000-0x57ffffff] # echo 0x58000000 > /sys/devices/system/memory/probe [ 29.151880] init_memory_mapping: [mem 0x58000000-0x5fffffff] # echo online_movable > /sys/devices/system/memory/memory11/state # echo online> /sys/devices/system/memory/memory8/state # echo online> /sys/devices/system/memory/memory10/state # echo offline> /sys/devices/system/memory/memory9/state [ 30.558819] Offlined Pages 32768 # free total used free shared buffers cached Mem: 780588 18014398509432020 830552 0 0 51180 
-/+ buffers/cache: 18014398509380840 881732 Swap: 0 0 0 This is because the above commands probe higher memory after online a section with online_movable, which causes ZONE_HIGHMEM (or ZONE_NORMAL for systems without ZONE_HIGHMEM) overlaps ZONE_MOVABLE. After the second online_movable, the problem can be observed from zoneinfo: # cat /proc/zoneinfo ... Node 0, zone Movable pages free 65491 min 250 low 312 high 375 scanned 0 spanned 18446744073709518848 present 65536 managed 65536 ... This series of patches solve the problem by checking ZONE_MOVABLE when choosing zone for new memory. If new memory is inside or higher than ZONE_MOVABLE, makes it go there instead. After applying this series of patches, following are free and zoneinfo result (after offlining memory9): bash-4.2# free total used free shared buffers cached Mem: 780956 80112 700844 0 0 51180 -/+ buffers/cache: 28932 752024 Swap: 0 0 0 bash-4.2# cat /proc/zoneinfo Node 0, zone DMA pages free 3389 min 14 low 17 high 21 scanned 0 spanned 4095 present 3998 managed 3977 nr_free_pages 3389 ... start_pfn: 1 inactive_ratio: 1 Node 0, zone DMA32 pages free 73724 min 341 low 426 high 511 scanned 0 spanned 98304 present 98304 managed 92958 nr_free_pages 73724 ... start_pfn: 4096 inactive_ratio: 1 Node 0, zone Normal pages free 32630 min 120 low 150 high 180 scanned 0 spanned 32768 present 32768 managed 32768 nr_free_pages 32630 ... start_pfn: 262144 inactive_ratio: 1 Node 0, zone Movable pages free 65476 min 241 low 301 high 361 scanned 0 spanned 98304 present 65536 managed 65536 nr_free_pages 65476 ... start_pfn: 294912 inactive_ratio: 1 This patch (of 7): Introduce zone_for_memory() in arch independent code for arch_add_memory() use. Many arch_add_memory() function simply selects ZONE_HIGHMEM or ZONE_NORMAL and add new memory into it. 
However, with the existence of ZONE_MOVABLE, the selection method should be carefully considered: if new, higher memory is added after ZONE_MOVABLE is set up, the default zone and ZONE_MOVABLE may overlap each other. should_add_memory_movable() checks the status of ZONE_MOVABLE. If it already contains memory, it compares the address of new memory and movable memory. If new memory is higher than movable, it should be added into ZONE_MOVABLE instead of default zone. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a3797d3fd8a4..2ff8c2325e96 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1159,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone.
+ */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.2 From 8d060bf490930f305c4efc45724e861a268f4d2f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:50 -0700 Subject: mm, oom: ensure memoryless node zonelist always includes zones With memoryless node support being worked on, it's possible that for optimizations that a node may not have a non-NULL zonelist. When CONFIG_NUMA is enabled and node 0 is memoryless, this means the zonelist for first_online_node may become NULL. The oom killer requires a zonelist that includes all memory zones for the sysrq trigger and pagefault out of memory handler. Ensure that a non-NULL zonelist is always passed to the oom killer. [akpm@linux-foundation.org: fix non-numa build] Signed-off-by: David Rientjes Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82d4352..b0a1e1ff0353 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -694,7 +694,7 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; - zonelist = node_zonelist(first_online_node, GFP_KERNEL); + zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); clear_zonelist_oom(zonelist, GFP_KERNEL); -- cgit v1.2.2 From e972a070e2d3296cd2e2cc2fd0561ce89a1d5ebf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:52 -0700 Subject: mm, oom: rename zonelist locking functions try_set_zonelist_oom() and clear_zonelist_oom() are not named properly to imply that they require locking semantics to avoid out_of_memory() being reordered. zone_scan_lock is required for both functions to ensure that there is proper locking synchronization. Rename try_set_zonelist_oom() to oom_zonelist_trylock() and rename clear_zonelist_oom() to oom_zonelist_unlock() to imply there is proper locking semantics. At the same time, convert oom_zonelist_trylock() to return bool instead of int since only success and failure are tested. Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 30 +++++++++++++----------------- mm/page_alloc.c | 6 +++--- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0a1e1ff0353..d33aca1552ad 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,28 +559,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. 
Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) if (zone_is_oom_locked(zone)) { - ret = 0; + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_set_flag(zone, ZONE_OOM_LOCKED); - } out: spin_unlock(&zone_scan_lock); @@ -592,15 +589,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. 
*/ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_clear_flag(zone, ZONE_OOM_LOCKED); - } spin_unlock(&zone_scan_lock); } @@ -695,8 +691,8 @@ void pagefault_out_of_memory(void) return; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb9908148474..578236089ec1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2246,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2285,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } -- cgit v1.2.2 From 8fe780484d2674eec27e12bb29c07d3e98a7ad21 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:54 -0700 Subject: mm, thp: restructure thp avoidance of light synchronous migration __GFP_NO_KSWAPD, once the way to determine if an allocation was for thp or not, has gained more users. 
Their use is not necessarily wrong, they are trying to do a memory allocation that can easily fail without disturbing kswapd, so the bit has gained additional usecases. This restructures the check to determine whether MIGRATE_SYNC_LIGHT should be used for memory compaction in the page allocator. Rather than testing solely for __GFP_NO_KSWAPD, test for all bits that must be set for thp allocations. This also moves the check to be done only after the page allocator is aborted for deferred or contended memory compaction since setting migration_mode for this case is pointless. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 578236089ec1..18cee0d4c8a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2638,14 +2638,6 @@ rebalance: if (page) goto got_pg; - /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. - */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; - /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller @@ -2656,6 +2648,15 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. 
+ */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.2 From d0177639310d23c7739500df3c6ce6fdfe34acec Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 6 Aug 2014 16:07:56 -0700 Subject: mm: fix potential infinite loop in dissolve_free_huge_pages() It is possible for some platforms, such as powerpc to set HPAGE_SHIFT to 0 to indicate huge pages not supported. When this is the case, hugetlbfs could be disabled during boot time: hugetlbfs: disabling because there are no supported hugepage sizes Then in dissolve_free_huge_pages(), order is kept maximum (64 for 64bits), and the for loop below won't end: for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) As suggested by Naoya, below fix checks hugepages_supported() before calling dissolve_free_huge_pages(). [rientjes@google.com: no legitimate reason to call dissolve_free_huge_pages() when !hugepages_supported()] Signed-off-by: Li Zhong Acked-by: Naoya Horiguchi Acked-by: David Rientjes Signed-off-by: David Rientjes Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9ad93b55585..eeceeeb09019 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1088,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct hstate *h; + if (!hugepages_supported()) + return; + /* Set scan step to minimum hugepage size */ for_each_hstate(h) if (order > huge_page_order(h)) -- cgit v1.2.2 From fb794bcbb4e5552242f9a4c5e1ffe4c6da29a968 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:58 -0700 Subject: mm, oom: remove unnecessary exit_state check The oom killer scans each process and determines whether it is 
eligible for oom kill or whether the oom killer should abort because of concurrent memory freeing. It will abort when an eligible process is found to have TIF_MEMDIE set, meaning it has already been oom killed and we're waiting for it to exit. Processes with task->mm == NULL should not be considered because they are either kthreads or have already detached their memory and killing them would not lead to memory freeing. That memory is only freed after exit_mm() has returned, however, and not when task->mm is first set to NULL. Clear TIF_MEMDIE after exit_mm()'s mmput() so that an oom killed process is no longer considered for oom kill, but only until exit_mm() has returned. This was fragile in the past because it relied on exit_notify() to be reached before no longer considering TIF_MEMDIE processes. Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d33aca1552ad..1e11df8fa7ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; -- cgit v1.2.2 From 7c0db9e917f77e6de2a524b33b5436491850dc79 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:08:01 -0700 Subject: mm, vmscan: fix an outdated comment still mentioning get_scan_ratio Quite a while ago, get_scan_ratio() has been renamed get_scan_count(), however a comment in shrink_active_list() still mention it. This patch fixes the outdated comment. 
Signed-off-by: Jerome Marchand Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c8222b499b4..88ab53c9949a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1756,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; -- cgit v1.2.2 From 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:08:03 -0700 Subject: memcg, vmscan: Fix forced scan of anonymous pages When memory cgroups are enabled, the code that decides to force to scan anonymous pages in get_scan_count() compares global values (free, high_watermark) to a value that is restricted to a memory cgroup (file). It makes the code over-eager to force anon scan. For instance, it will force anon scan when scanning a memcg that is mainly populated by anonymous pages, even when there are plenty of file pages to get rid of in other memcgs, even when swappiness == 0. It breaks user's expectation about swappiness and hurts performance. This patch makes sure that forced anon scan only happens when there are not enough file pages for the whole zone, not just in one random memcg.
[hannes@cmpxchg.org: cleanups] Signed-off-by: Jerome Marchand Acked-by: Michal Hocko Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Mel Gorman Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 88ab53c9949a..d2f65c856350 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1930,11 +1930,6 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -1945,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * anon pages. Try to detect this based on file LRU size. 
*/ if (global_reclaim(sc)) { - unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1982,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; -- cgit v1.2.2 From aecd6f44266c13b8709245b21ded2d19291ab070 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Aug 2014 16:08:05 -0700 Subject: mm: close race between do_fault_around() and fault_around_bytes_set() Things can go wrong if fault_around_bytes will be changed under do_fault_around(): between fault_around_mask() and fault_around_pages(). Let's read fault_around_bytes only once during do_fault_around() and calculate mask based on the reading. Note: fault_around_bytes can only be updated via debug interface. Also I've tried but was not able to trigger a bad behaviour without the patch. So I would not consider this patch as urgent. Signed-off-by: Kirill A. 
Shutemov Cc: Dave Hansen Cc: Andrey Ryabinin Cc: Sasha Levin Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4d0a543f3bb3..dc47261c4686 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2768,16 +2768,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); -static inline unsigned long fault_around_pages(void) -{ - return fault_around_bytes >> PAGE_SHIFT; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~(fault_around_bytes - 1) & PAGE_MASK; -} - #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { @@ -2842,12 +2832,15 @@ late_initcall(fault_around_debugfs); static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { - unsigned long start_addr; + unsigned long start_addr, nr_pages, mask; pgoff_t max_pgoff; struct vm_fault vmf; int off; - start_addr = max(address & fault_around_mask(), vma->vm_start); + nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; + + start_addr = max(address & mask, vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; @@ -2859,7 +2852,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + fault_around_pages() - 1); + pgoff + nr_pages - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { @@ -2894,7 +2887,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). 
*/ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_pages() > 1) { + fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) -- cgit v1.2.2 From 3a91053aebb23205caf67927be00c54cef6424b3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Aug 2014 16:08:07 -0700 Subject: mm: mark fault_around_bytes __read_mostly fault_around_bytes can only be changed via debugfs. Let's mark it read-mostly. Signed-off-by: Kirill A. Shutemov Suggested-by: David Rientjes Acked-by: David Rientjes Cc: Dave Hansen Cc: Andrey Ryabinin Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index dc47261c4686..5596d77e8656 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2766,7 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) -- cgit v1.2.2 From dbffcd03d77a3fb4d80a7981c7e589fc35769e9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Aug 2014 16:08:12 -0700 Subject: mm: change confusing #ifdef use in __access_remote_vm This patch changes confusing #ifdef use in __access_remote_vm into merely ugly #ifdef use. 
Addresses bug https://bugzilla.kernel.org/show_bug.cgi?id=81651 Signed-off-by: Rik van Riel Reported-by: David Binderman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5596d77e8656..5c55270729f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3613,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ -#ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; @@ -3625,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) -#endif break; bytes = ret; +#endif } else { bytes = len; offset = addr & (PAGE_SIZE-1); -- cgit v1.2.2 From 61e02c745721a361ba238e70bfa1c84a4df1a4b7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:08:16 -0700 Subject: mm: memcontrol: clean up reclaim size variable use in try_charge() Charge reclaim and OOM currently use the charge batch variable, but batching is already disabled at that point. To simplify the charge logic, the batch variable is reset to the original request size when reclaim is entered, so it's functionally equal, but it's misleading. Switch reclaim/OOM to nr_pages, which is the original request size. 
Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6a062e409eb..90dc501eaf3f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2612,7 +2612,7 @@ retry: nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= batch) + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) goto retry; if (gfp_mask & __GFP_NORETRY) @@ -2626,7 +2626,7 @@ retry: * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) goto retry; /* * At task move, charge accounts can be doubly counted. So, it's @@ -2644,7 +2644,7 @@ retry: if (fatal_signal_pending(current)) goto bypass; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -- cgit v1.2.2 From b972216e27d1c853eced33f8638926636c606341 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Aug 2014 16:08:20 -0700 Subject: mmu_notifier: add call_srcu and sync function for listener to delay call and sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kernel device drivers or subsystems want to bind their lifespan to the lifespan of the mm_struct, they usually use one of the following methods: 1. Manually calling a function in the interested kernel module. The function call needs to be placed in mmput. This method was rejected by several kernel maintainers. 2. Registering to the mmu notifier release mechanism.
The problem with the latter approach is that the mmu_notifier_release callback is called from __mmu_notifier_release (called from exit_mmap). That function iterates over the list of mmu notifiers and doesn't expect the release callback function to remove itself from the list. Therefore, the callback function in the kernel module can't release the mmu_notifier_object, which is actually the kernel module's object itself. As a result, the destruction of the kernel module's object must be done in a delayed fashion. This patch adds support for this delayed callback, by adding a new mmu_notifier_call_srcu function that receives a function ptr and calls that function with call_srcu. In that function, the kernel module releases its object. To use mmu_notifier_call_srcu, the calling module needs to call before that a new function called mmu_notifier_unregister_no_release that as its name implies, unregisters a notifier without calling its notifier release callback. This patch also adds a function that will call srcu_barrier so those kernel modules can sync with mmu_notifier. Signed-off-by: Peter Zijlstra Signed-off-by: Jérôme Glisse Signed-off-by: Oded Gabbay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf0aadd..950813b1eb36 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -22,6 +22,25 @@ /* global SRCU for all MMs */ static struct srcu_struct srcu; +/* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block.
+ */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); + srcu_read_unlock(&srcu, id); /* * synchronize_srcu here prevents mmu_notifier_release from returning to @@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. + */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. 
+ */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); -- cgit v1.2.2 From 15de36a4c3cf33aa4e194bfbff002048aa4a21c3 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Wed, 6 Aug 2014 16:08:23 -0700 Subject: mm/highmem: make kmap cache coloring aware User-visible effect: Architectures that choose this method of maintaining cache coherency (MIPS and xtensa currently) are able to use high memory on cores with aliasing data cache. Without this fix such architectures can not use high memory (in case of xtensa it means that at most 128 MBytes of physical memory is available). The problem: VIPT cache with way size larger than MMU page size may suffer from aliasing problem: a single physical address accessed via different virtual addresses may end up in multiple locations in the cache. Virtual mappings of a physical address that always get cached in different cache locations are said to have different colors. L1 caching hardware usually doesn't handle this situation leaving it up to software. Software must avoid this situation as it leads to data corruption. What can be done: One way to handle this is to flush and invalidate data cache every time page mapping changes color. The other way is to always map physical page at a virtual address with the same color. Low memory pages already have this property. Giving architecture a way to control color of high memory page mapping allows reusing of existing low memory cache alias handling code. How this is done with this patch: Provide hooks that allow architectures with aliasing cache to align mapping address of high pages according to their color. Such architectures may enforce similar coloring of low- and high-memory page mappings and reuse existing cache management functions to support highmem. 
This code is based on the implementation of similar feature for MIPS by Leonid Yegoshin. Signed-off-by: Max Filippov Cc: Leonid Yegoshin Cc: Chris Zankel Cc: Marc Gauthier Cc: David Rientjes Cc: Steven Hill Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index b32b70cdaed6..123bcd3ed4f2 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); */ #ifdef CONFIG_HIGHMEM +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; +} + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. + */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. 
+ */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. + */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) } static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a @@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); start: - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ @@ -181,12 +240,14 @@ start: */ { DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); + add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + remove_wait_queue(pkmap_map_wait, &wait); 
lock_kmap(); /* Somebody else might have mapped it while we slept */ @@ -274,6 +335,8 @@ void kunmap_high(struct page *page) unsigned long nr; unsigned long flags; int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); @@ -299,13 +362,14 @@ void kunmap_high(struct page *page) * no need for the wait-queue-head's lock. Simply * test if the queue is empty. */ - need_wakeup = waitqueue_active(&pkmap_map_wait); + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) - wake_up(&pkmap_map_wait); + wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); -- cgit v1.2.2 From 99eef8e9369abe009006b4fa7f6ca5086c09cf46 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:33 -0700 Subject: mm/zbud: change zbud_alloc size type to size_t Change the type of the zbud_alloc() size param from unsigned int to size_t. Technically, this should not make any difference, as the zbud implementation already restricts the size to well within either type's limits; but as zsmalloc (and kmalloc) use size_t, and zpool will use size_t, this brings the size parameter type in line with zsmalloc/zpool. 
Signed-off-by: Dan Streetman Acked-by: Seth Jennings Tested-by: Seth Jennings Cc: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index 01df13a7e2e1..d01226117b8d 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -122,7 +122,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; -- cgit v1.2.2 From af8d417a04564bca0348e7e3c749ab12a3e837ad Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:36 -0700 Subject: mm/zpool: implement common zpool api to zbud/zsmalloc Add zpool api. zpool provides an interface for memory storage, typically of compressed memory. Users can select what backend to use; currently the only implementations are zbud, a low density implementation with up to two compressed pages per storage page, and zsmalloc, a higher density implementation with multiple compressed pages per storage page. 
Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 41 ++++--- mm/Makefile | 1 + mm/zpool.c | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/zsmalloc.c | 1 - 4 files changed, 389 insertions(+), 18 deletions(-) create mode 100644 mm/zpool.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index f4899ec39cf4..12179b8c3b89 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -519,15 +519,17 @@ config CMA_AREAS If unsure, leave the default value "7". -config ZBUD - tristate - default n +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" @@ -549,17 +551,22 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. -config MEM_SOFT_DIRTY - bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS - select PROC_PAGE_MONITOR +config ZPOOL + tristate "Common API for compressed memory storage" + default n help - This option enables memory changes tracking by introducing a - soft-dirty bit on pte-s. 
This bit it set when someone writes - into a page just as regular dirty bit, but unlike the latter - it can be cleared by hands. + Compressed memory storage API. This allows using either zbud or + zsmalloc. - See Documentation/vm/soft-dirty.txt for more details. +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. config ZSMALLOC tristate "Memory allocator for compressed pages" diff --git a/mm/Makefile b/mm/Makefile index 8338473c329a..632ae77e6070 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000000..e40612a1df00 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. 
+ */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. 
+ */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module(type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. 
+ * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. 
+ * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operations on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. 
+ */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index bb62a4adc328..6a1827d3d231 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -240,7 +240,6 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -- cgit v1.2.2 From c795779df29e180738568d2a5eb3a42f3b5e47f0 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:38 -0700 Subject: mm/zpool: zbud/zsmalloc implement zpool Update zbud and zsmalloc to implement the zpool api. 
[fengguang.wu@intel.com: make functions static] Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/zsmalloc.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index d01226117b8d..a05790b1915e 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -51,6 +51,7 @@ #include #include #include +#include /***************** * Structures @@ -112,6 +113,90 @@ struct zbud_header { bool under_reclaim; }; +/***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return 
zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + /***************** * Helpers *****************/ @@ -511,11 +596,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 6a1827d3d231..4e2fc83cb394 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include #include #include +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -240,6 +241,82 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 
0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -813,6 +890,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -839,6 +920,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); -- cgit v1.2.2 From 12d79d64bfd3913693304feb8636ccab504b9e63 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:40 -0700 Subject: mm/zpool: update zswap to use zpool Change zswap to use the zpool api instead of directly using zbud. Add a boot-time param to allow selecting which zpool implementation to use, with zbud as the default. 
Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- mm/zswap.c | 75 +++++++++++++++++++++++++++++++++++++------------------------- 2 files changed, 46 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 12179b8c3b89..886db2158538 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -535,7 +535,7 @@ config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO - select ZBUD + select ZPOOL default n help A lightweight compressed cache for swap pages. It takes diff --git a/mm/zswap.c b/mm/zswap.c index 008388fe7b0f..032c21eeab2b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* zbud_pool is shared by all of zswap backend */ -static struct zbud_pool *zswap_pool; +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -168,7 +173,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. 
* offset - the swap offset for the entry. Index into the red-black tree. - * handle - zbud allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zbud allocation, + * Carries out the common pattern of freeing an entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(zswap_pool, entry->handle); + zpool_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); } /* caller must hold the tree lock */ @@ -409,7 +414,7 @@ cleanup: static bool zswap_is_full(void) { return totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages; + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } /********************************* @@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. 
*/ -static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { struct zswap_header *zhdr; swp_entry_t swpentry; @@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) }; /* extract swpentry from data */ - zhdr = zbud_map(pool, handle); + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zbud_unmap(pool, handle); + zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(zswap_pool, 8)) { + if (zpool_shrink(zswap_pool, 1, NULL)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - 
zhdr = zbud_map(zswap_pool, handle); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(zswap_pool, handle); + zpool_unmap_handle(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; @@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zbud_ops zswap_zbud_ops = { +static struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; @@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); @@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { + gfp_t gfp = 
__GFP_NORETRY | __GFP_NOWARN; + if (!zswap_enabled) return 0; pr_info("loading zswap\n"); - zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + &zswap_zpool_ops); + } if (!zswap_pool) { - pr_err("zbud pool creation failed\n"); + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); goto error; } + pr_info("using %s pool\n", zswap_zpool_type); if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); @@ -928,7 +943,7 @@ pcpufail: compfail: zswap_entry_cache_destory(); cachefail: - zbud_destroy_pool(zswap_pool); + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } -- cgit v1.2.2