author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>  2015-02-11 18:26:50 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>        2015-02-11 20:06:04 -0500
commit     dc6c9a35b66b520cf67e05d8ca60ebecad3b0479
tree       41075776145d02727c15c27d522b4c93529cca77 /mm/oom_kill.c
parent     8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6
mm: account pmd page tables to the process
Dave noticed that an unprivileged process can allocate a significant amount
of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and
the memory cgroup. The trick is to allocate a lot of PMD page tables: the
Linux kernel accounts only PTE tables to the process, not PMD tables.

The test case below uses a few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low, so oom_score for the process stays 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000

int main(void)
{
	char *addr = NULL;
	unsigned long i;

	/* Disable THP so the write below populates page tables, not a huge page. */
	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
	for (i = 0; i < NR_PUD; i++) {
		addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			break;
		}
		/* Fault in one page: this allocates a PMD table and a PTE table. */
		*addr = 'x';
		/*
		 * Unmapping one 2 MiB range frees the PTE table (keeping VmPTE
		 * low) but leaves the unaccounted PMD table behind.
		 */
		munmap(addr, PMD_SIZE);
		/* Re-create the mapping; it is never touched again, so no new
		 * PTE table is allocated. */
		addr = mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
		if (addr == MAP_FAILED) {
			perror("re-mmap");
			exit(1);
		}
	}
	/* Each iteration leaves one 4 KiB PMD table page behind. */
	printf("PID %d consumed %lu KiB in PMD page tables\n",
			getpid(), i * 4096 >> 10);
	return pause();
}
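For reference, a run looks something like this (the file name and PID are
illustrative, not from the original report):

	$ gcc -o pmd-tables pmd-tables.c
	$ ./pmd-tables &
	PID 1234 consumed 520000 KiB in PMD page tables
	$ grep -E 'VmRSS|VmPTE' /proc/1234/status

130000 PMD tables at 4 KiB each come to roughly 508 MiB, matching the
">500 MiB" figure above, while VmRSS and VmPTE stay tiny throughout.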
The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE tables.
The main places where PMD tables are accounted are __pmd_alloc() and
free_pmd_range(); a sketch follows the list below. But there are a few
corner cases:
- HugeTLB can share PMD page tables. The patch handles this by accounting
  the table to every process that shares it.
- x86 PAE pre-allocates a few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity
  check on exit(2).
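To make the main accounting site concrete, __pmd_alloc() with the patch
applied looks roughly like this (a simplified sketch of mm/memory.c, not
the verbatim diff; arch-specific ifdefs trimmed):

	int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
	{
		pmd_t *new = pmd_alloc_one(mm, address);
		if (!new)
			return -ENOMEM;

		smp_wmb(); /* See comment in __pte_alloc */

		spin_lock(&mm->page_table_lock);
		if (!pud_present(*pud)) {
			mm_inc_nr_pmds(mm);	/* new: count the PMD table */
			pud_populate(mm, pud, new);
		} else	/* Another thread populated it first. */
			pmd_free(mm, new);
		spin_unlock(&mm->page_table_lock);
		return 0;
	}

free_pmd_range() does the reverse, calling mm_dec_nr_pmds() when a PMD
table is torn down.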
Accounting only happens on configurations where the PMD page table level
is actually present (i.e. PMD is not folded). As with nr_ptes, we use a
per-mm counter; the oom-killer uses its value when calculating the
baseline for the badness score.
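The counter helpers look roughly like this (a sketch of the
include/linux/mm.h additions; on folded-PMD or !CONFIG_MMU configurations
they compile away to nothing):

	#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
	/* No real PMD level: nothing to count. */
	static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
	{
		return 0;
	}
	static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
	static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
	#else
	static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
	{
		return atomic_long_read(&mm->nr_pmds);
	}
	static inline void mm_inc_nr_pmds(struct mm_struct *mm)
	{
		atomic_long_inc(&mm->nr_pmds);
	}
	static inline void mm_dec_nr_pmds(struct mm_struct *mm)
	{
		atomic_long_dec(&mm->nr_pmds);
	}
	#endif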
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/oom_kill.c')
 mm/oom_kill.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b8df76ee2be3..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
-		 get_mm_counter(p->mm, MM_SWAPENTS);
+	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+		 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
 	task_unlock(p);

 	/*
@@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;

-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}

-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_pmds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);