author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>  2015-02-11 18:26:50 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>        2015-02-11 20:06:04 -0500
commit     dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 (patch)
tree       41075776145d02727c15c27d522b4c93529cca77 /arch/x86/mm
parent     8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6 (diff)
mm: account pmd page tables to the process
Dave noticed that an unprivileged process can allocate a significant
amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the
oom-killer and the memory cgroup. The trick is to allocate a lot of PMD
page tables: the Linux kernel accounts PTE tables to the process, but
not PMD tables.

The test program below uses a few tricks to allocate a lot of PMD page
tables while keeping VmRSS and VmPTE low, so the oom_score for the
process stays 0.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)

#define NR_PUD 130000

int main(void)
{
	char *addr = NULL;
	unsigned long i;

	/* Disable THP so the write below instantiates regular page tables. */
	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
	for (i = 0; i < NR_PUD; i++) {
		/* Map one PUD-sized (1 GiB) anonymous region per iteration. */
		addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			break;
		}
		/* Touch one byte: this allocates a PMD table, a PTE table
		 * and one data page for the region. */
		*addr = 'x';
		/* Unmap the first PMD range: the data page and the PTE
		 * table go away, but the PMD table for the region stays. */
		munmap(addr, PMD_SIZE);
		/* Map the hole again so the region keeps covering the PUD;
		 * left untouched, it allocates no new tables. */
		addr = mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
		if (addr == MAP_FAILED)
			perror("re-mmap"), exit(1);
	}
	printf("PID %d consumed %lu KiB in PMD page tables\n",
			getpid(), i * 4096 >> 10);
	return pause();
}
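
Each iteration leaves one 4 KiB PMD table behind, so with NR_PUD =
130000 the loop pins about 130000 * 4 KiB ≈ 508 MiB in PMD tables
alone, which matches the >500 MiB figure above.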
The patch addresses the issue by accounting PMD tables to the process
the same way we account PTE tables.
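
The diffstat below is limited to arch/x86/mm, so the counter itself is
not visible here; presumably it is a per-mm atomic counter with helpers
along these lines (a sketch inferred from the mm_inc_nr_pmds()/
mm_dec_nr_pmds() calls in the hunks below, not the exact upstream hunk):

/* Sketch of the per-mm PMD-table counter this patch relies on; the
 * helper names match the calls in the arch/x86/mm hunks below, but
 * these definitions are an assumption, not part of the shown diff. */
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
{
	return atomic_long_read(&mm->nr_pmds);
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
	atomic_long_inc(&mm->nr_pmds);
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
	atomic_long_dec(&mm->nr_pmds);
}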
PMD tables are mainly accounted in __pmd_alloc() and free_pmd_range(),
but there are a few corner cases:

 - HugeTLB can share PMD page tables. The patch handles this by
   accounting the table to every process that shares it (see the
   sketch after this list).
 - x86 PAE pre-allocates a few PMD tables on fork.
 - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the
   sanity check on exit(2).
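
For the HugeTLB case, the natural place for the charge is where an mm
attaches to an already-populated shared PMD table. A sketch of what the
huge_pmd_share() side might look like (paraphrased; the mm/hugetlb.c
hunk is outside the arch/x86/mm diffstat shown here):

/* Sketch, paraphrasing the hugetlb side of this patch: when an mm
 * populates its PUD entry with a PMD table shared via
 * huge_pmd_share(), charge that mm for the table as well. */
spin_lock(ptl);
if (pud_none(*pud)) {
	pud_populate(mm, pud,
			(pmd_t *)((unsigned long)spte & PAGE_MASK));
	mm_inc_nr_pmds(mm);
} else {
	put_page(virt_to_page(spte));
}
spin_unlock(ptl);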
Accounting only happens on configurations where the PMD page table
level is actually present (i.e. PMD is not folded). As with nr_ptes, we
use a per-mm counter. The counter value is used to calculate the
baseline for the oom-killer's badness score, roughly as sketched below.
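
A sketch of the resulting oom_badness() baseline (again paraphrased,
since the mm/oom_kill.c hunk is outside the arch/x86/mm diffstat):

/* Sketch: the badness baseline sums RSS, PTE tables, PMD tables and
 * swap entries, so the exploit above no longer scores 0. */
points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
	 mm_nr_pmds(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS);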
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/pgtable.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..7b22adaad4f1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 
 #endif	/* CONFIG_X86_PAE */
 
-static void free_pmds(pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[])
 		if (pmds[i]) {
 			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+			mm_dec_nr_pmds(mm);
 		}
 }
 
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 			pmd = NULL;
 			failed = true;
 		}
+		if (pmd)
+			mm_inc_nr_pmds(mm);
 		pmds[i] = pmd;
 	}
 
 	if (failed) {
-		free_pmds(pmds);
+		free_pmds(mm, pmds);
 		return -ENOMEM;
 	}
 
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 
 		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 		pmd_free(mm, pmd);
+		mm_dec_nr_pmds(mm);
 	}
 }
 
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	mm->pgd = pgd;
 
-	if (preallocate_pmds(pmds) != 0)
+	if (preallocate_pmds(mm, pmds) != 0)
 		goto out_free_pgd;
 
 	if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 
 out_free_pmds:
-	free_pmds(pmds);
+	free_pmds(mm, pmds);
 out_free_pgd:
 	free_page((unsigned long)pgd);
 out: