author     Johannes Weiner <hannes@cmpxchg.org>              2019-05-13 20:21:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2019-05-14 12:47:50 -0400
commit     8c7829b04c523cdc732cb77f59f03320e09f3386
tree       75e0a075776ef67f7c2fab1336fff5ee73953fef
parent     ac5c94264580f498e484c854031d0226b3c1038f
mm: fix false-positive OVERCOMMIT_GUESS failures
With the default overcommit==guess we occasionally run into mmap
rejections despite plenty of memory that would get dropped under
pressure but just isn't accounted reclaimable. One example of this is
dying cgroups pinned by some page cache. A previous case was auxiliary
path name memory associated with dentries; we have since annotated
those allocations to avoid overcommit failures (see d79f7aa496fc ("mm:
treat indirectly reclaimable memory as free in overcommit logic")).

But trying to classify all allocated memory reliably as reclaimable
and unreclaimable is a bit of a fool's errand. There could be a myriad
of dependencies that constantly change with kernel versions.

It becomes even more questionable of an effort when considering how
this estimate of available memory is used: it's not compared to the
system-wide allocated virtual memory in any way. It's not even
compared to the allocating process's address space. It's compared to
the single allocation request at hand!

So we have an elaborate left-hand side of the equation that tries to
assess the exact breathing room the system has available down to a
page - and then compare it to an isolated allocation request with no
additional context. We could fail an allocation of N bytes, but for
two allocations of N/2 bytes we'd do this elaborate dance twice in a
row and then still let N bytes of virtual memory through. This doesn't
make a whole lot of sense.

Let's take a step back and look at the actual goal of the heuristic.
From the documentation:

   Heuristic overcommit handling. Obvious overcommits of address
   space are refused. Used for a typical system. It ensures a
   seriously wild allocation fails while allowing overcommit to
   reduce swap usage. root is allowed to allocate slightly more
   memory in this mode. This is the default.

If all we want to do is catch clearly bogus allocation requests
irrespective of the general virtual memory situation, the physical
memory counterpart doesn't need to be that complicated, either.

When in GUESS mode, catch wild allocations by comparing their request
size to the total amount of RAM and swap in the system.

Link: http://lkml.kernel.org/r/20190412191418.26333-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
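The N-versus-two-N/2 observation above is easy to exercise from
userspace. Below is a minimal demo sketch, not part of the patch: it
assumes vm.overcommit_memory=0 (OVERCOMMIT_GUESS), a 64-bit address
space, and uses an illustrative 64 MiB of padding to make the single
request clearly wild.

/* Demo sketch: under OVERCOMMIT_GUESS, one mapping larger than
 * RAM + swap is refused as a "wild" allocation, while the same
 * total split into two halves is admitted. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>

static const char *try_map(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	return p == MAP_FAILED ? "refused" : "admitted";
}

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si))
		return 1;

	/* total RAM + swap in bytes, padded so the request is clearly wild */
	size_t wild = ((size_t)si.totalram + si.totalswap) * si.mem_unit
		      + (64UL << 20);

	printf("one %zu-byte mapping:  %s\n", wild, try_map(wild));
	printf("two %zu-byte mappings: %s, %s\n", wild / 2,
	       try_map(wild / 2), try_map(wild / 2));
	return 0;
}

With this patch, the first mapping trips the new pages >
totalram_pages() + total_swap_pages check and fails with ENOMEM, while
each half-size request passes it - illustrating that the heuristic
polices individual requests, not the overall commit.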
-rw-r--r--  mm/util.c | 51
1 file changed, 5 insertions(+), 46 deletions(-)
diff --git a/mm/util.c b/mm/util.c
index 05a464929b3e..e2e4f8c3fa12 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-	long free, allowed, reserve;
+	long allowed;
 
 	VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
 			-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		return 0;
 
 	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-		free = global_zone_page_state(NR_FREE_PAGES);
-		free += global_node_page_state(NR_FILE_PAGES);
-
-		/*
-		 * shmem pages shouldn't be counted as free in this
-		 * case, they can't be purged, only swapped out, and
-		 * that won't affect the overall amount of available
-		 * memory in the system.
-		 */
-		free -= global_node_page_state(NR_SHMEM);
-
-		free += get_nr_swap_pages();
-
-		/*
-		 * Any slabs which are created with the
-		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
-		 * which are reclaimable, under pressure.  The dentry
-		 * cache and most inode caches should fall into this
-		 */
-		free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
-		/*
-		 * Part of the kernel memory, which can be released
-		 * under memory pressure.
-		 */
-		free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
-		/*
-		 * Leave reserved pages. The pages are not for anonymous pages.
-		 */
-		if (free <= totalreserve_pages)
+		if (pages > totalram_pages() + total_swap_pages)
 			goto error;
-		else
-			free -= totalreserve_pages;
-
-		/*
-		 * Reserve some for root
-		 */
-		if (!cap_sys_admin)
-			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
-		if (free > pages)
-			return 0;
-
-		goto error;
+		return 0;
 	}
 
 	allowed = vm_commit_limit();
@@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	 * Don't let a single process grow so big a user can't recover
 	 */
 	if (mm) {
-		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
 		allowed -= min_t(long, mm->total_vm / 32, reserve);
 	}
 
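As a side note, the sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10)
idiom retained in the last hunk converts kilobytes to pages: a page is
2^PAGE_SHIFT bytes, i.e. 2^(PAGE_SHIFT - 10) kilobytes. A standalone
sketch of the arithmetic, assuming 4 KiB pages and an illustrative
reserve value (not read from the live sysctl):

/* Sketch of the kbytes-to-pages conversion used for the reserves.
 * PAGE_SHIFT == 12 (4 KiB pages) and the reserve value are assumptions. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	long user_reserve_kbytes = 131072;	/* e.g. 128 MiB */

	/* kbytes / (PAGE_SIZE / 1024), written as a shift as in mm/util.c */
	long reserve_pages = user_reserve_kbytes >> (PAGE_SHIFT - 10);

	printf("%ld kB == %ld pages of %d bytes\n",
	       user_reserve_kbytes, reserve_pages, 1 << PAGE_SHIFT);
	return 0;
}

With those assumptions, 131072 kB >> 2 gives 32768 pages, i.e. 128 MiB.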