author		Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2015-02-11 18:26:50 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-11 20:06:04 -0500
commit		dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 (patch)
tree		41075776145d02727c15c27d522b4c93529cca77 /mm
parent		8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6 (diff)
mm: account pmd page tables to the process
Dave noticed that an unprivileged process can allocate a significant amount of
memory -- more than 500 MiB on x86_64 -- and stay unnoticed by the oom-killer
and the memory cgroup. The trick is to allocate a lot of PMD page tables: the
Linux kernel doesn't account PMD tables to the process, only PTE tables.
The test program below uses a few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low, so the oom_score for the process stays 0.
Each iteration maps a PUD-sized (1 GiB) region and touches one byte in it,
which allocates one PMD table; the munmap/re-mmap of the first PMD-sized
(2 MiB) chunk then releases the touched page and its PTE table again. 130000
iterations therefore pin about 130000 * 4 KiB = ~508 MiB in PMD tables alone.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)

#define NR_PUD 130000

int main(void)
{
	char *addr = NULL;
	unsigned long i;

	/* Disable THP so the write below faults in a 4K page, not a huge page. */
	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
	for (i = 0; i < NR_PUD; i++) {
		/* One PUD-sized mapping per iteration: each needs its own PMD table. */
		addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			break;
		}
		/* Fault in one page so the PMD table gets allocated... */
		*addr = 'x';
		/* ...then drop the page and its PTE table again, keeping
		   VmRSS and VmPTE low. The PMD table itself stays. */
		munmap(addr, PMD_SIZE);
		if (mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED,
				-1, 0) == MAP_FAILED) {
			perror("re-mmap");
			exit(1);
		}
	}
	/* One 4 KiB PMD table per PUD-sized mapping. */
	printf("PID %d consumed %lu KiB in PMD page tables\n",
			getpid(), i * 4096 >> 10);
	return pause();
}
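The program compiles with plain gcc and sits in pause() when done; comparing
the KiB figure it prints with VmRSS and VmPTE in /proc/<pid>/status shows
that, before this patch, almost none of the consumed memory was visible to
the kernel's accounting.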
The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE tables.
The main places where PMD tables are accounted are __pmd_alloc() and
free_pmd_range(). But there are a few corner cases:
- HugeTLB can share PMD page tables. The patch handles this by accounting
the table to every process that shares it.
- x86 PAE pre-allocates a few PMD tables on fork; this is handled in the
arch code (see the sketch after this list).
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity
check on exit(2).
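The PAE case lives in arch/x86/mm/pgtable.c and therefore doesn't show up in
the mm/-limited diffstat below. A minimal sketch of that side, with details
approximated from the pre-existing helpers there -- the mm parameter and the
mm_inc_nr_pmds() call are what the patch adds:

	/* Illustrative sketch, not the verbatim arch/x86 change. PAE
	 * pre-allocates one PMD table per pgd entry at pgd_alloc() time,
	 * so each pre-allocated table must be accounted to the new mm. */
	static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
	{
		int i;
		bool failed = false;

		for (i = 0; i < PREALLOCATED_PMDS; i++) {
			pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);

			if (!pmd)
				failed = true;
			if (pmd)
				mm_inc_nr_pmds(mm);	/* account the pre-allocated table */
			pmds[i] = pmd;
		}

		if (failed) {
			free_pmds(mm, pmds);	/* must undo the accounting too */
			return -ENOMEM;
		}

		return 0;
	}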
Accounting only happens on configurations where the PMD page table level is
actually present (PMD is not folded). As with nr_ptes, we use a per-mm
counter. The counter value is used to calculate the baseline for the badness
score by the oom-killer.
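For reference, the counter and its helpers are introduced on the
include/linux/mm.h side of the patch (also outside the mm/-limited diffstat).
Assuming an atomic_long_t nr_pmds field in struct mm_struct, the helpers are
roughly:

	#if defined(CONFIG_MMU) && !defined(__PAGETABLE_PMD_FOLDED)
	static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
	{
		return atomic_long_read(&mm->nr_pmds);
	}

	static inline void mm_inc_nr_pmds(struct mm_struct *mm)
	{
		atomic_long_inc(&mm->nr_pmds);
	}

	static inline void mm_dec_nr_pmds(struct mm_struct *mm)
	{
		atomic_long_dec(&mm->nr_pmds);
	}
	#else /* PMD level folded: nothing to count */
	static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
	{
		return 0;
	}

	static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
	static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
	#endif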
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')

 mm/debug.c    |  3
 mm/hugetlb.c  |  8
 mm/memory.c   | 15
 mm/mmap.c     |  4
 mm/oom_kill.c |  9

 5 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/mm/debug.c b/mm/debug.c
index d69cb5a7ba9a..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_pmds((struct mm_struct *)mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fd28d6ba5e5d..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		if (saddr) {
 			spte = huge_pte_offset(svma->vm_mm, saddr);
 			if (spte) {
+				mm_inc_nr_pmds(mm);
 				get_page(virt_to_page(spte));
 				break;
 			}
@@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 
 	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
 	spin_lock(ptl);
-	if (pud_none(*pud))
+	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
-	else
+	} else {
 		put_page(virt_to_page(spte));
+		mm_inc_nr_pmds(mm);
+	}
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 
 	pud_clear(pud);
 	put_page(virt_to_page(ptep));
+	mm_dec_nr_pmds(mm);
 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
 	return 1;
 }
diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
 	pmd_free_tlb(tlb, pmd, start);
+	mm_dec_nr_pmds(tlb->mm);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud))	/* Another has populated it */
-		pmd_free(mm, new);
-	else
+	if (!pud_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pud_populate(mm, pud, new);
+	} else	/* Another has populated it */
+		pmd_free(mm, new);
 #else
-	if (pgd_present(*pud))	/* Another has populated it */
-		pmd_free(mm, new);
-	else
+	if (!pgd_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pgd_populate(mm, pud, new);
+	} else	/* Another has populated it */
+		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 
 	WARN_ON(atomic_long_read(&mm->nr_ptes) >
-			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+			round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT);
+	WARN_ON(mm_nr_pmds(mm) >
+			round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b8df76ee2be3..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
-		 get_mm_counter(p->mm, MM_SWAPENTS);
+	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+		 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_pmds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);