From 3b33719c9b741066f7d2bc6036409752f8e0478d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Aug 2016 16:27:44 -0700 Subject: thp: move shmem_huge_enabled() outside of SYSFS ifdef The newly introduced shmem_huge_enabled() function has two definitions, but neither of them is visible if CONFIG_SYSFS is disabled, leading to a build error: mm/khugepaged.o: In function `khugepaged': khugepaged.c:(.text.khugepaged+0x3ca): undefined reference to `shmem_huge_enabled' This changes the #ifdef guards around the definition to match those that are used in the header file. Fixes: e496cf3d7821 ("thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE") Link: http://lkml.kernel.org/r/20160809123638.1357593-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 7f7748a0f9e1..fd8b2b5741b1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3975,7 +3975,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4006,7 +4008,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ #else /* !CONFIG_SHMEM */ -- cgit v1.2.2 From 81cbcbc2d810c0ce49fba81f864302e1afe5ff27 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 10 Aug 2016 16:27:46 -0700 Subject: mm/page_alloc.c: fix wrong initialization when sysctl_min_unmapped_ratio changes Before resetting min_unmapped_pages, we need to initialize min_unmapped_pages rather than min_slab_pages. Fixes: a5f5f91da6 (mm: convert zone_reclaim to node_reclaim) Link: http://lkml.kernel.org/r/1470724248-26780-1-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee744fa3b93d..9a92718b1103 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6854,7 +6854,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, return rc; for_each_online_pgdat(pgdat) - pgdat->min_slab_pages = 0; + pgdat->min_unmapped_pages = 0; for_each_zone(zone) zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * -- cgit v1.2.2 From 6423aa8192c596848e1b23bd4193dc0924e7274d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 10 Aug 2016 16:27:49 -0700 Subject: mm/page_alloc.c: recalculate some of node threshold when on/offline memory Some of node threshold depends on number of managed pages in the node. When memory is going on/offline, it can be changed and we need to adjust them. Add recalculation to appropriate places and clean-up related functions for better maintenance. Link: http://lkml.kernel.org/r/1470724248-26780-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9a92718b1103..ab2c0ff8c2e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4757,6 +4757,8 @@ int local_memory_node(int node) } #endif +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void set_zonelist_order(void) @@ -5878,9 +5880,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) - / 100; - pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; zone->zone_pgdat = pgdat; @@ -6801,6 +6800,12 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + return 0; } core_initcall(init_per_zone_wmark_min) @@ -6842,16 +6847,10 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +static void setup_min_unmapped_ratio(void) { - struct pglist_data *pgdat; + pg_data_t *pgdat; struct zone *zone; - int rc; - - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (rc) - return rc; for_each_online_pgdat(pgdat) pgdat->min_unmapped_pages = 0; @@ -6859,26 +6858,47 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, for_each_zone(zone) zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; - return 0; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, + +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct pglist_data *pgdat; - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; + setup_min_unmapped_ratio(); + + return 0; +} + +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + for_each_online_pgdat(pgdat) pgdat->min_slab_pages = 0; for_each_zone(zone) zone->zone_pgdat->min_slab_pages += (zone->managed_pages * sysctl_min_slab_ratio) / 100; +} + +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + setup_min_slab_ratio(); + return 0; } #endif -- cgit v1.2.2 From c8efc390c1e0eca195ae59a2f7cec46773620e0c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Aug 2016 16:27:52 -0700 Subject: mm, rmap: fix false positive VM_BUG() in page_add_file_rmap() PageTransCompound() doesn't distinguish THP from from any other type of compound pages. This can lead to false-positive VM_BUG_ON() in page_add_file_rmap() if called on compound page from a driver[1]. I think we can exclude such cases by checking if the page belong to a mapping. The VM_BUG_ON_PAGE() is downgraded to VM_WARN_ON_ONCE(). This path should not cause any harm to non-THP page, but good to know if we step on anything else. [1] http://lkml.kernel.org/r/c711e067-0bff-a6cb-3c37-04dfe77d2db1@redhat.com Link: http://lkml.kernel.org/r/20160810161345.GA67522@black.fi.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Laura Abbott Tested-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 709bc83703b1..d4f56060ba3f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1284,8 +1284,9 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(!PageSwapBacked(page), page); __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { - if (PageTransCompound(page)) { - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (PageTransCompound(page) && page_mapping(page)) { + VM_WARN_ON_ONCE(!PageLocked(page)); + SetPageDoubleMap(compound_head(page)); if (PageMlocked(page)) clear_page_mlock(compound_head(page)); -- cgit v1.2.2 From 57dea93ac42d341e1fe902528b348ef6763fa485 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Wed, 10 Aug 2016 16:27:55 -0700 Subject: rmap: fix compound check logic in page_remove_file_rmap In page_remove_file_rmap(.) we have the following check: VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); This is meant to check for either HugeTLB pages or THP when a compound page is passed in. Unfortunately, if one disables CONFIG_TRANSPARENT_HUGEPAGE, then PageTransHuge(.) will always return false, provoking BUGs when one runs the libhugetlbfs test suite. This patch replaces PageTransHuge(), with PageHead() which will work for both HugeTLB and THP. Fixes: dd78fedde4b9 ("rmap: support file thp") Link: http://lkml.kernel.org/r/1470838217-5889-1-git-send-email-steve.capper@arm.com Signed-off-by: Steve Capper Acked-by: Kirill A. Shutemov Cc: Huang Shijie Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index d4f56060ba3f..1ef36404e7b2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1304,7 +1304,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) { int i, nr = 1; - VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(compound && !PageHead(page), page); lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ -- cgit v1.2.2 From 6039892396d845b18228935561960441900cffca Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 10 Aug 2016 16:27:58 -0700 Subject: mm/slub.c: run free_partial() outside of the kmem_cache_node->list_lock With debugobjects enabled and using SLAB_DESTROY_BY_RCU, when a kmem_cache_node is destroyed the call_rcu() may trigger a slab allocation to fill the debug object pool (__debug_object_init:fill_pool). Everywhere but during kmem_cache_destroy(), discard_slab() is performed outside of the kmem_cache_node->list_lock and avoids a lockdep warning about potential recursion: ============================================= [ INFO: possible recursive locking detected ] 4.8.0-rc1-gfxbench+ #1 Tainted: G U --------------------------------------------- rmmod/8895 is trying to acquire lock: (&(&n->list_lock)->rlock){-.-...}, at: [] get_partial_node.isra.63+0x47/0x430 but task is already holding lock: (&(&n->list_lock)->rlock){-.-...}, at: [] __kmem_cache_shutdown+0x54/0x320 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&n->list_lock)->rlock); lock(&(&n->list_lock)->rlock); *** DEADLOCK *** May be due to missing lock nesting notation 5 locks held by rmmod/8895: #0: (&dev->mutex){......}, at: driver_detach+0x42/0xc0 #1: (&dev->mutex){......}, at: driver_detach+0x50/0xc0 #2: (cpu_hotplug.dep_map){++++++}, at: get_online_cpus+0x2d/0x80 #3: (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x3c/0x220 #4: (&(&n->list_lock)->rlock){-.-...}, at: __kmem_cache_shutdown+0x54/0x320 stack backtrace: CPU: 6 PID: 8895 Comm: rmmod Tainted: G U 4.8.0-rc1-gfxbench+ #1 Hardware name: Gigabyte Technology Co., Ltd. H87M-D3H/H87M-D3H, BIOS F11 08/18/2015 Call Trace: __lock_acquire+0x1646/0x1ad0 lock_acquire+0xb2/0x200 _raw_spin_lock+0x36/0x50 get_partial_node.isra.63+0x47/0x430 ___slab_alloc.constprop.67+0x1a7/0x3b0 __slab_alloc.isra.64.constprop.66+0x43/0x80 kmem_cache_alloc+0x236/0x2d0 __debug_object_init+0x2de/0x400 debug_object_activate+0x109/0x1e0 __call_rcu.constprop.63+0x32/0x2f0 call_rcu+0x12/0x20 discard_slab+0x3d/0x40 __kmem_cache_shutdown+0xdb/0x320 shutdown_cache+0x19/0x60 kmem_cache_destroy+0x1ae/0x220 i915_gem_load_cleanup+0x14/0x40 [i915] i915_driver_unload+0x151/0x180 [i915] i915_pci_remove+0x14/0x20 [i915] pci_device_remove+0x34/0xb0 __device_release_driver+0x95/0x140 driver_detach+0xb6/0xc0 bus_remove_driver+0x53/0xd0 driver_unregister+0x27/0x50 pci_unregister_driver+0x25/0x70 i915_exit+0x1a/0x1e2 [i915] SyS_delete_module+0x193/0x1f0 entry_SYSCALL_64_fastpath+0x1c/0xac Fixes: 52b4b950b507 ("mm: slab: free kmem_cache_node after destroy sysfs file") Link: http://lkml.kernel.org/r/1470759070-18743-1-git-send-email-chris@chris-wilson.co.uk Reported-by: Dave Gordon Signed-off-by: Chris Wilson Reviewed-by: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Dmitry Safonov Cc: Daniel Vetter Cc: Dave Gordon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index cead06394e9e..9adae58462f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3629,6 +3629,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { + LIST_HEAD(discard); struct page *page, *h; BUG_ON(irqs_disabled()); @@ -3636,13 +3637,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); - discard_slab(s, page); + list_add(&page->lru, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(page, h, &discard, lru) + discard_slab(s, page); } /* -- cgit v1.2.2 From c1470b33bb6e18cddd361fef339ef225b8339fe7 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 11 Aug 2016 15:32:55 -0700 Subject: mm/hugetlb: fix incorrect hugepages count during mem hotplug When memory hotplug operates, free hugepages will be freed if the movable node is offline. Therefore, /proc/sys/vm/nr_hugepages will be incorrect. Fix it by reducing max_huge_pages when the node is offlined. n-horiguchi@ah.jp.nec.com said: : dissolve_free_huge_page intends to break a hugepage into buddy, and the : destination hugepage is supposed to be allocated from the pool of the : destination node, so the system-wide pool size is reduced. So adding : h->max_huge_pages-- makes sense to me. Link: http://lkml.kernel.org/r/1470624546-902-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Mike Kravetz Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b9aa1b0b38b0..87e11d8ad536 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page) list_del(&page->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; + h->max_huge_pages--; update_and_free_page(h, page); } spin_unlock(&hugetlb_lock); -- cgit v1.2.2 From 2f95ff90b99600f53df4a0aa652322d349d67957 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 11 Aug 2016 15:32:57 -0700 Subject: proc, meminfo: use correct helpers for calculating LRU sizes in meminfo meminfo_proc_show() and si_mem_available() are using the wrong helpers for calculating the size of the LRUs. The user-visible impact is that there appears to be an abnormally high number of unevictable pages. Link: http://lkml.kernel.org/r/20160805105805.GR2799@techsingularity.net Signed-off-by: Mel Gorman Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab2c0ff8c2e6..3fbe73a6fe4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4060,7 +4060,7 @@ long si_mem_available(void) int lru; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += zone->watermark[WMARK_LOW]; -- cgit v1.2.2 From 1f47b61fb4077936465dcde872a4e5cc4fe708da Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:00 -0700 Subject: mm: memcontrol: fix swap counter leak on swapout from offline cgroup An offline memory cgroup might have anonymous memory or shmem left charged to it and no swap. Since only swap entries pin the id of an offline cgroup, such a cgroup will have no id and so an attempt to swapout its anon/shmem will not store memory cgroup info in the swap cgroup map. As a result, memcg->swap or memcg->memsw will never get uncharged from it and any of its ascendants. Fix this by always charging swapout to the first ancestor cgroup that hasn't released its id yet. [hannes@cmpxchg.org: add comment to mem_cgroup_swapout] [vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()] Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e74d7080ec9e..791b00ca4e8b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4082,6 +4082,24 @@ static void mem_cgroup_id_get(struct mem_cgroup *memcg) atomic_inc(&memcg->id.ref); } +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + static void mem_cgroup_id_put(struct mem_cgroup *memcg) { if (atomic_dec_and_test(&memcg->id.ref)) { @@ -5800,7 +5818,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5815,16 +5833,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is @@ -5863,11 +5892,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + memcg = mem_cgroup_id_get_online(memcg); + if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) + !page_counter_try_charge(&memcg->swap, 1, &counter)) { + mem_cgroup_id_put(memcg); return -ENOMEM; + } - mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); -- cgit v1.2.2 From 615d66c37c755c49ce022c9e5ac0875d27d2603d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:03 -0700 Subject: mm: memcontrol: fix memcg id ref counter on swap charge move Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") swap entries do not pin memcg->css.refcnt directly. Instead, they pin memcg->id.ref. So we should adjust the reference counters accordingly when moving swap charges between cgroups. Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 791b00ca4e8b..2ff0289ad061 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4077,9 +4077,9 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + atomic_add(n, &memcg->id.ref); } static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) @@ -4100,9 +4100,9 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) return memcg; } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4111,6 +4111,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4745,6 +4755,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4752,9 +4764,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); -- cgit v1.2.2 From bcbf0d566b6e59a6e873bfe415cc415111a819e2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Aug 2016 15:33:06 -0700 Subject: kasan: remove the unnecessary WARN_ONCE from quarantine.c It's quite unlikely that the user will so little memory that the per-CPU quarantines won't fit into the given fraction of the available memory. Even in that case he won't be able to do anything with the information given in the warning. Link: http://lkml.kernel.org/r/1470929182-101413-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Kuthonuzo Luruo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index b6728a33a4ac..baabaad4a4aa 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -217,11 +217,8 @@ void quarantine_reduce(void) new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - if (WARN_ONCE(new_quarantine_size < percpu_quarantines, - "Too little memory, disabling global KASAN quarantine.\n")) - new_quarantine_size = 0; - else - new_quarantine_size -= percpu_quarantines; + new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? + 0 : new_quarantine_size - percpu_quarantines; WRITE_ONCE(quarantine_size, new_quarantine_size); last = global_quarantine.head; -- cgit v1.2.2 From f33e6f0671b3ba81acef4d7c078af86afcc855c4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 11 Aug 2016 15:33:09 -0700 Subject: mm, oom: fix uninitialized ret in task_will_free_mem() mm/oom_kill.c: In function `task_will_free_mem': mm/oom_kill.c:767: warning: `ret' may be used uninitialized in this function If __task_will_free_mem() is never called inside the for_each_process() loop, ret will not be initialized. Fixes: 1af8bb43269563e4 ("mm, oom: fortify task_will_free_mem()") Link: http://lkml.kernel.org/r/1470255599-24841-1-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Acked-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7d0a275df822..d53a9aa00977 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; - bool ret; + bool ret = true; /* * Skip tasks without mm because it might have passed its exit_mm and -- cgit v1.2.2 From 5830169f47269f78f6624bd70165eb571270da82 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Thu, 11 Aug 2016 15:33:12 -0700 Subject: mm/memory_hotplug.c: initialize per_cpu_nodestats for hotadded pgdats The following oops occurs after a pgdat is hotadded: Unable to handle kernel paging request for data at address 0x00c30001 Faulting instruction address: 0xc00000000022f8f4 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA pSeries Modules linked in: ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter nls_utf8 isofs sg virtio_balloon uio_pdrv_genirq uio ip_tables xfs libcrc32c sr_mod cdrom sd_mod virtio_net ibmvscsi scsi_transport_srp virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 4.8.0-rc1-device #110 task: c000000000ef3080 task.stack: c000000000f6c000 NIP: c00000000022f8f4 LR: c00000000022f948 CTR: 0000000000000000 REGS: c000000000f6fa50 TRAP: 0300 Tainted: G W (4.8.0-rc1-device) MSR: 800000010280b033 CR: 84002028 XER: 20000000 CFAR: d000000001d2013c DAR: 0000000000c30001 DSISR: 40000000 SOFTE: 0 NIP refresh_cpu_vm_stats+0x1a4/0x2f0 LR refresh_cpu_vm_stats+0x1f8/0x2f0 Call Trace: refresh_cpu_vm_stats+0x1f8/0x2f0 (unreliable) Add per_cpu_nodestats initialization to the hotplug codepath. Link: http://lkml.kernel.org/r/1470931473-7090-1-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Cc: Mel Gorman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3894b65b1555..41266dc29f33 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_node(nid, zones_size, start_pfn, zholes_size); + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) static void rollback_node_hotadd(int nid, pg_data_t *pgdat) { arch_refresh_nodedata(nid, NULL); + free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); return; } -- cgit v1.2.2 From 7329a655875a2f4bd6984fe8a7e00a6981e802f3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/usercopy.c b/mm/usercopy.c index 8ebae91a6b55..82f81df2edcf 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -124,7 +124,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ -- cgit v1.2.2 From 94cd97af690dd9537818dc9841d0ec68bb1dd877 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/usercopy.c b/mm/usercopy.c index 82f81df2edcf..a3cc3052f830 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -83,7 +83,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; -- cgit v1.2.2 From 804dd150468cfd920d92d4b3cf00536fedef3902 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 25 Aug 2016 15:16:57 -0700 Subject: soft_dirty: fix soft_dirty during THP split While adding proper userfaultfd_wp support with bits in pagetable and swap entry to avoid false positives WP userfaults through swap/fork/ KSM/etc, I've been adding a framework that mostly mirrors soft dirty. So I noticed in one place I had to add uffd_wp support to the pagetables that wasn't covered by soft_dirty and I think it should have. Example: in the THP migration code migrate_misplaced_transhuge_page() pmd_mkdirty is called unconditionally after mk_huge_pmd. entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); That sets soft dirty too (it's a false positive for soft dirty, the soft dirty bit could be more finegrained and transfer the bit like uffd_wp will do.. pmd/pte_uffd_wp() enforces the invariant that when it's set pmd/pte_write is not set). However in the THP split there's no unconditional pmd_mkdirty after mk_huge_pmd and pte_swp_mksoft_dirty isn't called after the migration entry is created. The code sets the dirty bit in the struct page instead of setting it in the pagetable (which is fully equivalent as far as the real dirty bit is concerned, as the whole point of pagetable bits is to be eventually flushed out of to the page, but that is not equivalent for the soft-dirty bit that gets lost in translation). This was found by code review only and totally untested as I'm working to actually replace soft dirty and I don't have time to test potential soft dirty bugfixes as well :). Transfer the soft_dirty from pmd to pte during THP splits. This fix avoids losing the soft_dirty bit and avoids userland memory corruption in the checkpoint. Fixes: eef1b3ba053aa6 ("thp: implement split_huge_pmd()") Link: http://lkml.kernel.org/r/1471610515-30229-2-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: "Kirill A. Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2373f0a7d340..2db2112aa31e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1512,7 +1512,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty; + bool young, write, dirty, soft_dirty; unsigned long addr; int i; @@ -1546,6 +1546,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1562,6 +1563,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); } else { entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(entry, vma); @@ -1569,6 +1572,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); } if (dirty) SetPageDirty(page + i); -- cgit v1.2.2 From b32eaf71db6085f2ba54cf3ddf688bfc858219d0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 25 Aug 2016 15:17:05 -0700 Subject: mm: clarify COMPACTION Kconfig text The current wording of the COMPACTION Kconfig help text doesn't emphasise that disabling COMPACTION might cripple the page allocator which relies on the compaction quite heavily for high order requests and an unexpected OOM can happen with the lack of compaction. Make sure we are vocal about that. Link: http://lkml.kernel.org/r/20160823091726.GK23577@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Markus Trippelsdorf Cc: Mel Gorman Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 78a23c5c302d..be0ee11fa0d9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,7 +262,14 @@ config COMPACTION select MIGRATION depends on MMU help - Allows the compaction of memory for the allocation of huge pages. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration -- cgit v1.2.2 From 358c07fcc3b60ab08d77f1684de8bd81bcf49a1a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Aug 2016 15:17:08 -0700 Subject: mm: memcontrol: avoid unused function warning A bugfix in v4.8-rc2 introduced a harmless warning when CONFIG_MEMCG_SWAP is disabled but CONFIG_MEMCG is enabled: mm/memcontrol.c:4085:27: error: 'mem_cgroup_id_get_online' defined but not used [-Werror=unused-function] static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) This moves the function inside of the #ifdef block that hides the calling function, to avoid the warning. Fixes: 1f47b61fb407 ("mm: memcontrol: fix swap counter leak on swapout from offline cgroup") Link: http://lkml.kernel.org/r/20160824113733.2776701-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ff0289ad061..9a6a51a7c416 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4082,24 +4082,6 @@ static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) atomic_add(n, &memcg->id.ref); } -static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) -{ - while (!atomic_inc_not_zero(&memcg->id.ref)) { - /* - * The root cgroup cannot be destroyed, so it's refcount must - * always be >= 1. - */ - if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { - VM_BUG_ON(1); - break; - } - memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; - } - return memcg; -} - static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (atomic_sub_and_test(n, &memcg->id.ref)) { @@ -5821,6 +5803,24 @@ static int __init mem_cgroup_init(void) subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_swapout - transfer a memsw charge to swap * @page: page whose memsw charge to transfer -- cgit v1.2.2 From 11bd969fdefea3ac0cb9791224f1e09784e21e58 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Aug 2016 15:17:17 -0700 Subject: mm: silently skip readahead for DAX inodes For DAX inodes we need to be careful to never have page cache pages in the mapping->page_tree. This radix tree should be composed only of DAX exceptional entries and zero pages. ltp's readahead02 test was triggering a warning because we were trying to insert a DAX exceptional entry but found that a page cache page had already been inserted into the tree. This page was being inserted into the radix tree in response to a readahead(2) call. Readahead doesn't make sense for DAX inodes, but we don't want it to report a failure either. Instead, we just return success and don't do any work. Link: http://lkml.kernel.org/r/20160824221429.21158-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Jeff Moyer Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 65ec288dc057..c8a955b1297e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -544,6 +545,14 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops) return -EINVAL; + /* + * Readahead doesn't make sense for DAX inodes, but we don't want it + * to report a failure either. Instead, we just return success and + * don't do any work. + */ + if (dax_mapping(mapping)) + return 0; + return force_page_cache_readahead(mapping, filp, index, nr); } -- cgit v1.2.2