about summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
authorKirill A. Shutemov <kirill.shutemov@linux.intel.com>2015-02-11 18:26:50 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 20:06:04 -0500
commitdc6c9a35b66b520cf67e05d8ca60ebecad3b0479 (patch)
tree41075776145d02727c15c27d522b4c93529cca77 /mm
parent8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6 (diff)
mm: account pmd page tables to the process
Dave noticed that an unprivileged process can allocate a significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. The Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use a few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/mman.h> #include <sys/prctl.h> #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by accounting PMD tables to the process the same way we account PTE. The main places where PMD tables are accounted are __pmd_alloc() and free_pmd_range(). But there are a few corner cases: - HugeTLB can share PMD page tables. The patch handles this by accounting the table to all processes who share it. - x86 PAE pre-allocates a few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity check on exit(2). Accounting only happens on configurations where the PMD page table level is present (PMD is not folded). As with nr_ptes we use a per-mm counter. The counter value is used to calculate the baseline for the badness score by the oom-killer. Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Reported-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Hugh Dickins <hughd@google.com> Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: David Rientjes <rientjes@google.com> Tested-by: Sedat Dilek <sedat.dilek@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/debug.c3
-rw-r--r--mm/hugetlb.c8
-rw-r--r--mm/memory.c15
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/oom_kill.c9
5 files changed, 25 insertions, 14 deletions
diff --git a/mm/debug.c b/mm/debug.c
index d69cb5a7ba9a..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
173 "get_unmapped_area %p\n" 173 "get_unmapped_area %p\n"
174#endif 174#endif
175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
176 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" 176 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" 178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
179 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 179 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
206 mm->pgd, atomic_read(&mm->mm_users), 206 mm->pgd, atomic_read(&mm->mm_users),
207 atomic_read(&mm->mm_count), 207 atomic_read(&mm->mm_count),
208 atomic_long_read((atomic_long_t *)&mm->nr_ptes), 208 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
209 mm_nr_pmds((struct mm_struct *)mm),
209 mm->map_count, 210 mm->map_count,
210 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, 211 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
211 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, 212 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fd28d6ba5e5d..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3598 if (saddr) { 3598 if (saddr) {
3599 spte = huge_pte_offset(svma->vm_mm, saddr); 3599 spte = huge_pte_offset(svma->vm_mm, saddr);
3600 if (spte) { 3600 if (spte) {
3601 mm_inc_nr_pmds(mm);
3601 get_page(virt_to_page(spte)); 3602 get_page(virt_to_page(spte));
3602 break; 3603 break;
3603 } 3604 }
@@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3609 3610
3610 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3611 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3611 spin_lock(ptl); 3612 spin_lock(ptl);
3612 if (pud_none(*pud)) 3613 if (pud_none(*pud)) {
3613 pud_populate(mm, pud, 3614 pud_populate(mm, pud,
3614 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3615 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3615 else 3616 } else {
3616 put_page(virt_to_page(spte)); 3617 put_page(virt_to_page(spte));
3618 mm_inc_nr_pmds(mm);
3619 }
3617 spin_unlock(ptl); 3620 spin_unlock(ptl);
3618out: 3621out:
3619 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3622 pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3644 3647
3645 pud_clear(pud); 3648 pud_clear(pud);
3646 put_page(virt_to_page(ptep)); 3649 put_page(virt_to_page(ptep));
3650 mm_dec_nr_pmds(mm);
3647 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3651 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
3648 return 1; 3652 return 1;
3649} 3653}
diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
428 pmd = pmd_offset(pud, start); 428 pmd = pmd_offset(pud, start);
429 pud_clear(pud); 429 pud_clear(pud);
430 pmd_free_tlb(tlb, pmd, start); 430 pmd_free_tlb(tlb, pmd, start);
431 mm_dec_nr_pmds(tlb->mm);
431} 432}
432 433
433static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 434static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3322 3323
3323 spin_lock(&mm->page_table_lock); 3324 spin_lock(&mm->page_table_lock);
3324#ifndef __ARCH_HAS_4LEVEL_HACK 3325#ifndef __ARCH_HAS_4LEVEL_HACK
3325 if (pud_present(*pud)) /* Another has populated it */ 3326 if (!pud_present(*pud)) {
3326 pmd_free(mm, new); 3327 mm_inc_nr_pmds(mm);
3327 else
3328 pud_populate(mm, pud, new); 3328 pud_populate(mm, pud, new);
3329#else 3329 } else /* Another has populated it */
3330 if (pgd_present(*pud)) /* Another has populated it */
3331 pmd_free(mm, new); 3330 pmd_free(mm, new);
3332 else 3331#else
3332 if (!pgd_present(*pud)) {
3333 mm_inc_nr_pmds(mm);
3333 pgd_populate(mm, pud, new); 3334 pgd_populate(mm, pud, new);
3335 } else /* Another has populated it */
3336 pmd_free(mm, new);
3334#endif /* __ARCH_HAS_4LEVEL_HACK */ 3337#endif /* __ARCH_HAS_4LEVEL_HACK */
3335 spin_unlock(&mm->page_table_lock); 3338 spin_unlock(&mm->page_table_lock);
3336 return 0; 3339 return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index 14d84666e8ba..6a7d36d133fb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm)
2853 vm_unacct_memory(nr_accounted); 2853 vm_unacct_memory(nr_accounted);
2854 2854
2855 WARN_ON(atomic_long_read(&mm->nr_ptes) > 2855 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2856 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2856 round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT);
2857 WARN_ON(mm_nr_pmds(mm) >
2858 round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);
2857} 2859}
2858 2860
2859/* Insert vm structure into process list sorted by address 2861/* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b8df76ee2be3..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
169 * The baseline for the badness score is the proportion of RAM that each 169 * The baseline for the badness score is the proportion of RAM that each
170 * task's rss, pagetable and swap space use. 170 * task's rss, pagetable and swap space use.
171 */ 171 */
172 points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + 172 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
173 get_mm_counter(p->mm, MM_SWAPENTS); 173 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
174 task_unlock(p); 174 task_unlock(p);
175 175
176 /* 176 /*
@@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
351 struct task_struct *p; 351 struct task_struct *p;
352 struct task_struct *task; 352 struct task_struct *task;
353 353
354 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); 354 pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
355 rcu_read_lock(); 355 rcu_read_lock();
356 for_each_process(p) { 356 for_each_process(p) {
357 if (oom_unkillable_task(p, memcg, nodemask)) 357 if (oom_unkillable_task(p, memcg, nodemask))
@@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
367 continue; 367 continue;
368 } 368 }
369 369
370 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", 370 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
371 task->pid, from_kuid(&init_user_ns, task_uid(task)), 371 task->pid, from_kuid(&init_user_ns, task_uid(task)),
372 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 372 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
373 atomic_long_read(&task->mm->nr_ptes), 373 atomic_long_read(&task->mm->nr_ptes),
374 mm_nr_pmds(task->mm),
374 get_mm_counter(task->mm, MM_SWAPENTS), 375 get_mm_counter(task->mm, MM_SWAPENTS),
375 task->signal->oom_score_adj, task->comm); 376 task->signal->oom_score_adj, task->comm);
376 task_unlock(task); 377 task_unlock(task);