author     Linus Torvalds <torvalds@linux-foundation.org>  2013-09-12 18:44:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-09-12 18:44:27 -0400
commit     ac4de9543aca59f2b763746647577302fbedd57e
tree       40407750569ee030de56233c41c9a97f7e89cf67
parent     26935fb06ee88f1188789807687c03041f3c70d9
parent     de32a8177f64bc62e1b19c685dd391af664ab13f
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton:
"The rest of MM. Plus one misc cleanup"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
mm/Kconfig: add MMU dependency for MIGRATION.
kernel: replace strict_strto*() with kstrto*()
mm, thp: count thp_fault_fallback anytime thp fault fails
thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
thp: do_huge_pmd_anonymous_page() cleanup
thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
mm: cleanup add_to_page_cache_locked()
thp: account anon transparent huge pages into NR_ANON_PAGES
truncate: drop 'oldsize' truncate_pagecache() parameter
mm: make lru_add_drain_all() selective
memcg: document cgroup dirty/writeback memory statistics
memcg: add per cgroup writeback pages accounting
memcg: check for proper lock held in mem_cgroup_update_page_stat
memcg: remove MEMCG_NR_FILE_MAPPED
memcg: reduce function dereference
memcg: avoid overflow caused by PAGE_ALIGN
memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
memcg: correct RESOURCE_MAX to ULLONG_MAX
mm: memcg: do not trap chargers with full callstack on OOM
mm: memcg: rework and document OOM waiting and wakeup
...
79 files changed, 973 insertions, 919 deletions
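
Most of the per-architecture fault.c changes below apply the same pattern from this series: FAULT_FLAG_WRITE is no longer folded into flags at declaration time but set inside the VMA access check, FAULT_FLAG_USER is set once the handler knows the fault came from user mode, and a kernel-mode fault that ends in VM_FAULT_OOM goes to no_context instead of pagefault_out_of_memory(). The sketch below is for orientation only; it is not any single architecture's handler, and the function name do_page_fault_sketch is hypothetical:

#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/ptrace.h>

/* Illustrative only: the common shape of the fault handlers in this merge. */
static void do_page_fault_sketch(struct pt_regs *regs, unsigned long address,
                                 int is_write)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        int fault;

        if (user_mode(regs))            /* new: tell generic mm code who faulted */
                flags |= FAULT_FLAG_USER;

        down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        if (!vma || vma->vm_start > address)
                goto bad_area;

        if (is_write) {
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                flags |= FAULT_FLAG_WRITE;      /* set with the access check */
        } else {
                if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                        goto bad_area;
        }

        fault = handle_mm_fault(mm, vma, address, flags);
        up_read(&mm->mmap_sem);
        if (fault & VM_FAULT_OOM) {
                if (!user_mode(regs))
                        goto no_context;        /* kernel faults never invoke the OOM killer */
                pagefault_out_of_memory();
        }
        return;

bad_area:
        up_read(&mm->mmap_sem);
        /* deliver SIGSEGV ... */
        return;
no_context:
        /* fixup_exception() / die() ... */
        return;
}
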
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 2a3330696372..8af4ad121828 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -490,6 +490,8 @@ pgpgin - # of charging events to the memory cgroup. The charging
 pgpgout - # of uncharging events to the memory cgroup. The uncharging
 event happens each time a page is unaccounted from the cgroup.
 swap - # of bytes of swap usage
+writeback - # of bytes of file/anon cache that are queued for syncing to
+disk.
 inactive_anon - # of bytes of anonymous and swap cache memory on inactive
 LRU list.
 active_anon - # of bytes of anonymous and swap cache memory on active
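
The hunk above documents the new writeback field of memory.stat. A small user-space sketch of reading it, assuming the cgroup v1 memory controller is mounted at /sys/fs/cgroup/memory and a group named "mygroup" exists (both path and group name are assumptions for the example, not part of this patch set):

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* memory.stat is a sequence of "key value" lines; scan for "writeback". */
        FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.stat", "r");
        char key[64];
        unsigned long long val;

        if (!f) {
                perror("memory.stat");
                return 1;
        }
        while (fscanf(f, "%63s %llu", key, &val) == 2) {
                if (strcmp(key, "writeback") == 0)
                        printf("writeback: %llu bytes\n", val);
        }
        fclose(f);
        return 0;
}
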
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 0c4132dd3507..98838a05ba6d 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 const struct exception_table_entry *fixup;
 int fault, si_code = SEGV_MAPERR;
 siginfo_t info;
-unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(cause > 0 ? FAULT_FLAG_WRITE : 0));
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 /* As of EV6, a load into $31/$f31 is a prefetch, and never faults
 (or is suppressed by the PALcode). Support that for older CPUs
@@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 if (address >= TASK_SIZE)
 goto vmalloc_fault;
 #endif
-
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -142,6 +142,7 @@ retry:
 } else {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 }
 
 /* If for any reason at all we couldn't handle the fault,
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 0fd1f0d515ff..d63f3de0cd5b 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 siginfo_t info;
 int fault, ret;
 int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 /*
 * We fault-in kernel-space virtual memory on-demand. The
@@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -117,12 +118,12 @@ good_area:
 if (write) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 goto bad_area;
 }
 
-survive:
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
@@ -201,10 +202,6 @@ no_context:
 die("Oops", regs, address);
 
 out_of_memory:
-if (is_global_init(tsk)) {
-yield();
-goto survive;
-}
 up_read(&mm->mmap_sem);
 
 if (user_mode(regs)) {
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index c97f7940cb95..eb8830a4c5ed 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 struct task_struct *tsk;
 struct mm_struct *mm;
 int fault, sig, code;
-int write = fsr & FSR_WRITE;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 if (notify_page_fault(regs, fsr))
 return 0;
@@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+if (fsr & FSR_WRITE)
+flags |= FAULT_FLAG_WRITE;
+
 /*
 * As per x86, we may deadlock here. However, since the kernel only
 * validly references user space from well defined areas of the code,
@@ -349,6 +352,13 @@ retry:
 if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 return 0;
 
+/*
+* If we are in kernel mode at this point, we
+* have no context to handle this fault with.
+*/
+if (!user_mode(regs))
+goto no_context;
+
 if (fault & VM_FAULT_OOM) {
 /*
 * We ran out of memory, call the OOM killer, and return to
@@ -359,13 +369,6 @@ retry:
 return 0;
 }
 
-/*
-* If we are in kernel mode at this point, we
-* have no context to handle this fault with.
-*/
-if (!user_mode(regs))
-goto no_context;
-
 if (fault & VM_FAULT_SIGBUS) {
 /*
 * We had some memory, but were unable to
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 6c8ba25bf6bb..6d6acf153bff 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-if (esr & ESR_LNX_EXEC) {
-vm_flags = VM_EXEC;
-} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
-vm_flags = VM_WRITE;
-mm_flags |= FAULT_FLAG_WRITE;
-}
-
 tsk = current;
 mm = tsk->mm;
 
@@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+mm_flags |= FAULT_FLAG_USER;
+
+if (esr & ESR_LNX_EXEC) {
+vm_flags = VM_EXEC;
+} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
+vm_flags = VM_WRITE;
+mm_flags |= FAULT_FLAG_WRITE;
+}
+
 /*
 * As per x86, we may deadlock here. However, since the kernel only
 * validly references user space from well defined areas of the code,
@@ -288,6 +291,13 @@ retry:
 VM_FAULT_BADACCESS))))
 return 0;
 
+/*
+* If we are in kernel mode at this point, we have no context to
+* handle this fault with.
+*/
+if (!user_mode(regs))
+goto no_context;
+
 if (fault & VM_FAULT_OOM) {
 /*
 * We ran out of memory, call the OOM killer, and return to
@@ -298,13 +308,6 @@ retry:
 return 0;
 }
 
-/*
-* If we are in kernel mode at this point, we have no context to
-* handle this fault with.
-*/
-if (!user_mode(regs))
-goto no_context;
-
 if (fault & VM_FAULT_SIGBUS) {
 /*
 * We had some memory, but were unable to successfully fix up
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index b2f2d2d66849..0eca93327195 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 
 local_irq_enable();
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 
@@ -228,9 +230,9 @@ no_context:
 */
 out_of_memory:
 up_read(&mm->mmap_sem);
-pagefault_out_of_memory();
 if (!user_mode(regs))
 goto no_context;
+pagefault_out_of_memory();
 return;
 
 do_sigbus:
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 73312ab6c696..1790f22e71a2 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 struct vm_area_struct * vma;
 siginfo_t info;
 int fault;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 D(printk(KERN_DEBUG
 "Page fault for %lX on %X at %lX, prot %d write %d\n",
@@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -155,6 +156,7 @@ retry:
 } else if (writeaccess == 1) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 goto bad_area;
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index 331c1e2cfb67..9a66372fc7c7 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 struct vm_area_struct *vma;
 struct mm_struct *mm;
 unsigned long _pme, lrai, lrad, fixup;
+unsigned long flags = 0;
 siginfo_t info;
 pgd_t *pge;
 pud_t *pue;
 pte_t *pte;
-int write;
 int fault;
 
 #if 0
@@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(__frame))
+flags |= FAULT_FLAG_USER;
+
 down_read(&mm->mmap_sem);
 
 vma = find_vma(mm, ear0);
@@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 */
 good_area:
 info.si_code = SEGV_ACCERR;
-write = 0;
 switch (esr0 & ESR0_ATXC) {
 default:
 /* handle write to write protected page */
@@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 #endif
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
-write = 1;
+flags |= FAULT_FLAG_WRITE;
 break;
 
 /* handle read from protected page */
@@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
+fault = handle_mm_fault(mm, vma, ear0, flags);
 if (unlikely(fault & VM_FAULT_ERROR)) {
 if (fault & VM_FAULT_OOM)
 goto out_of_memory;
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 1bd276dbec7d..8704c9320032 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 int si_code = SEGV_MAPERR;
 int fault;
 const struct exception_table_entry *fixup;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(cause > 0 ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 /*
 * If we're in an interrupt or have no user context,
@@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 
 local_irq_enable();
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -96,6 +97,7 @@ good_area:
 case FLT_STORE:
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 break;
 }
 
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 6cf0341f978e..7225dad87094 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
 | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
 
-flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
-
 /* mmap_sem is performance critical.... */
 prefetchw(&mm->mmap_sem);
 
@@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 if (notify_page_fault(regs, TRAP_BRKPT))
 return;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+if (mask & VM_WRITE)
+flags |= FAULT_FLAG_WRITE;
 retry:
 down_read(&mm->mmap_sem);
 
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index 3cdfa9c1d091..e9c6a8014bd6 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 struct mm_struct *mm;
 struct vm_area_struct * vma;
 unsigned long page, addr;
-int write;
+unsigned long flags = 0;
 int fault;
 siginfo_t info;
 
@@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 if (in_atomic() || !mm)
 goto bad_area_nosemaphore;
 
+if (error_code & ACE_USERMODE)
+flags |= FAULT_FLAG_USER;
+
 /* When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in the
 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 */
 good_area:
 info.si_code = SEGV_ACCERR;
-write = 0;
 switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
 default: /* 3: write, present */
 /* fall through */
 case ACE_WRITE: /* write, not present */
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
-write++;
+flags |= FAULT_FLAG_WRITE;
 break;
 case ACE_PROTECTION: /* read, present */
 case 0: /* read, not present */
@@ -194,7 +196,7 @@ good_area:
 */
 addr = (address & PAGE_MASK);
 set_thread_fault_code(error_code);
-fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
+fault = handle_mm_fault(mm, vma, addr, flags);
 if (unlikely(fault & VM_FAULT_ERROR)) {
 if (fault & VM_FAULT_OOM)
 goto out_of_memory;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index a563727806bf..eb1d61f68725 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 8fddf46e6c62..332680e5ebf2 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 struct vm_area_struct *vma, *prev_vma;
 siginfo_t info;
 int fault;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write_access ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 tsk = current;
 
@@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 
@@ -121,6 +122,7 @@ good_area:
 if (write_access) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 goto bad_area;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index 731f739d17a1..fa4cf52aa7a6 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 int code = SEGV_MAPERR;
 int is_write = error_code & ESR_S;
 int fault;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(is_write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 regs->ear = address;
 regs->esr = error_code;
@@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 die("Weird page fault", regs, SIGSEGV);
 }
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+
 /* When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in the
 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -199,6 +201,7 @@ good_area:
 if (unlikely(is_write)) {
 if (unlikely(!(vma->vm_flags & VM_WRITE)))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 /* a read */
 } else {
 /* protection fault */
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 85df1cd8d446..becc42bb1849 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 const int field = sizeof(unsigned long) * 2;
 siginfo_t info;
 int fault;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 #if 0
 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
@@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 if (in_atomic() || !mm)
 goto bad_area_nosemaphore;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -114,6 +115,7 @@ good_area:
 if (write) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (cpu_has_rixi) {
 if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) {
@@ -241,6 +243,8 @@ out_of_memory:
 * (which will retry the fault, or kill us if we got oom-killed).
 */
 up_read(&mm->mmap_sem);
+if (!user_mode(regs))
+goto no_context;
 pagefault_out_of_memory();
 return;
 
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 8a2e6ded9a44..3516cbdf1ee9 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 if (in_atomic() || !mm)
 goto no_context;
 
+if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 4a41f8493ab0..0703acf7d327 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
 if (user_mode(regs)) {
 /* Exception was in userspace: reenable interrupts */
 local_irq_enable();
+flags |= FAULT_FLAG_USER;
 } else {
 /* If exception was in a syscall, then IRQ's may have
 * been enabled or disabled. If they were enabled,
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index f247a3480e8e..d10d27a720c0 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+if (acc_type & VM_WRITE)
+flags |= FAULT_FLAG_WRITE;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma_prev(mm, address, &prev_vma);
@@ -203,8 +207,7 @@ good_area:
 * fault.
 */
 
-fault = handle_mm_fault(mm, vma, address,
-flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
+fault = handle_mm_fault(mm, vma, address, flags);
 
 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
 return;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2dd69bf4af46..51ab9e7e6c39 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
 
-if (is_write)
-flags |= FAULT_FLAG_WRITE;
-
 #ifdef CONFIG_PPC_ICSWX
 /*
 * we need to do this early because this "data storage
@@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 if (user_mode(regs))
 store_update_sp = store_updates_sp(regs);
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+
 /* When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in the
 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -415,6 +415,7 @@ good_area:
 } else if (is_write) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 /* a read */
 } else {
 /* protection fault */
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7de4469915f0..fc6679210d83 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
 address = trans_exc_code & __FAIL_ADDR_MASK;
 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 flags |= FAULT_FLAG_WRITE;
 down_read(&mm->mmap_sem);
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6b18fb0189ae..52238983527d 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 struct task_struct *tsk = current;
 struct mm_struct *mm = tsk->mm;
 const int field = sizeof(unsigned long) * 2;
+unsigned long flags = 0;
 siginfo_t info;
 int fault;
 
@@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 if (in_atomic() || !mm)
 goto bad_area_nosemaphore;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
 if (!vma)
@@ -95,18 +99,18 @@ good_area:
 if (write) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
 goto bad_area;
 }
 
-survive:
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-fault = handle_mm_fault(mm, vma, address, write);
+fault = handle_mm_fault(mm, vma, address, flags);
 if (unlikely(fault & VM_FAULT_ERROR)) {
 if (fault & VM_FAULT_OOM)
 goto out_of_memory;
@@ -167,11 +171,6 @@ no_context:
 */
 out_of_memory:
 up_read(&mm->mmap_sem);
-if (is_global_init(tsk)) {
-yield();
-down_read(&mm->mmap_sem);
-goto survive;
-}
 if (!user_mode(regs))
 goto no_context;
 pagefault_out_of_memory();
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 1f49c28affa9..541dc6101508 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 struct mm_struct *mm;
 struct vm_area_struct * vma;
 int fault;
-int write = error_code & FAULT_CODE_WRITE;
-unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0));
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 tsk = current;
 mm = tsk->mm;
@@ -476,6 +474,11 @@ good_area:
 
 set_thread_fault_code(error_code);
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+if (error_code & FAULT_CODE_WRITE)
+flags |= FAULT_FLAG_WRITE;
+
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index e98bfda205a2..59dbd4645725 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 unsigned long g2;
 int from_user = !(regs->psr & PSR_PS);
 int fault, code;
-unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0));
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 if (text_fault)
 address = regs->pc;
@@ -235,6 +234,11 @@ good_area:
 goto bad_area;
 }
 
+if (from_user)
+flags |= FAULT_FLAG_USER;
+if (write)
+flags |= FAULT_FLAG_WRITE;
+
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
@@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write)
 struct vm_area_struct *vma;
 struct task_struct *tsk = current;
 struct mm_struct *mm = tsk->mm;
+unsigned int flags = FAULT_FLAG_USER;
 int code;
 
 code = SEGV_MAPERR;
@@ -402,11 +407,12 @@ good_area:
 if (write) {
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 goto bad_area;
 }
-switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
+switch (handle_mm_fault(mm, vma, address, flags)) {
 case VM_FAULT_SIGBUS:
 case VM_FAULT_OOM:
 goto do_sigbus;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 5062ff389e83..2ebec263d685 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 bad_kernel_pc(regs, address);
 return;
 }
-}
+} else
+flags |= FAULT_FLAG_USER;
 
 /*
 * If we're in an interrupt or have no user
@@ -418,13 +419,14 @@ good_area:
 vma->vm_file != NULL)
 set_thread_fault_code(fault_code |
 FAULT_CODE_BLKCOMMIT);
+
+flags |= FAULT_FLAG_WRITE;
 } else {
 /* Allow reads even for write-only mappings */
 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 goto bad_area;
 }
 
-flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
 fault = handle_mm_fault(mm, vma, address, flags);
 
 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 111d5a9b76f1..4c288f199453 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
 if (!is_page_fault)
 write = 1;
 
-flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0));
+flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 is_kernel_mode = !user_mode(regs);
 
@@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
 goto bad_area_nosemaphore;
 }
 
+if (!is_kernel_mode)
+flags |= FAULT_FLAG_USER;
+
 /*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in the
@@ -425,12 +427,12 @@ good_area:
 #endif
 if (!(vma->vm_flags & VM_WRITE))
 goto bad_area;
+flags |= FAULT_FLAG_WRITE;
 } else {
 if (!is_page_fault || !(vma->vm_flags & VM_READ))
 goto bad_area;
 }
 
-survive:
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
@@ -555,11 +557,6 @@ no_context:
 */
 out_of_memory:
 up_read(&mm->mmap_sem);
-if (is_global_init(tsk)) {
-yield();
-down_read(&mm->mmap_sem);
-goto survive;
-}
 if (is_kernel_mode)
 goto no_context;
 pagefault_out_of_memory();
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 089f3987e273..5c3aef74237f 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 pmd_t *pmd;
 pte_t *pte;
 int err = -EFAULT;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(is_write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 *code_out = SEGV_MAPERR;
 
@@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 if (in_atomic())
 goto out_nosemaphore;
 
+if (is_user)
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
@@ -58,12 +59,15 @@ retry:
 
 good_area:
 *code_out = SEGV_ACCERR;
-if (is_write && !(vma->vm_flags & VM_WRITE))
-goto out;
-
-/* Don't require VM_READ|VM_EXEC for write faults! */
-if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
-goto out;
+if (is_write) {
+if (!(vma->vm_flags & VM_WRITE))
+goto out;
+flags |= FAULT_FLAG_WRITE;
+} else {
+/* Don't require VM_READ|VM_EXEC for write faults! */
+if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+goto out;
+}
 
 do {
 int fault;
@@ -124,6 +128,8 @@ out_of_memory:
 * (which will retry the fault, or kill us if we got oom-killed).
 */
 up_read(&mm->mmap_sem);
+if (!is_user)
+goto out_nosemaphore;
 pagefault_out_of_memory();
 return 0;
 }
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index f9b5c10bccee..0dc922dba915 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 struct task_struct *tsk;
 struct mm_struct *mm;
 int fault, sig, code;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 tsk = current;
 mm = tsk->mm;
@@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 if (in_atomic() || !mm)
 goto no_context;
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
+if (!(fsr ^ 0x12))
+flags |= FAULT_FLAG_WRITE;
+
 /*
 * As per x86, we may deadlock here. However, since the kernel only
 * validly references user space from well defined areas of the code,
@@ -278,6 +282,13 @@ retry:
 (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 return 0;
 
+/*
+* If we are in kernel mode at this point, we
+* have no context to handle this fault with.
+*/
+if (!user_mode(regs))
+goto no_context;
+
 if (fault & VM_FAULT_OOM) {
 /*
 * We ran out of memory, call the OOM killer, and return to
@@ -288,13 +299,6 @@ retry:
 return 0;
 }
 
-/*
-* If we are in kernel mode at this point, we
-* have no context to handle this fault with.
-*/
-if (!user_mode(regs))
-goto no_context;
-
 if (fault & VM_FAULT_SIGBUS) {
 /*
 * We had some memory, but were unable to
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4ae3047..3aaeffcfd67a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline int
+static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 unsigned long address, unsigned int fault)
 {
-/*
-* Pagefault was interrupted by SIGKILL. We have no reason to
-* continue pagefault.
-*/
-if (fatal_signal_pending(current)) {
-if (!(fault & VM_FAULT_RETRY))
-up_read(&current->mm->mmap_sem);
-if (!(error_code & PF_USER))
-no_context(regs, error_code, address, 0, 0);
-return 1;
+if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+up_read(&current->mm->mmap_sem);
+no_context(regs, error_code, address, 0, 0);
+return;
 }
-if (!(fault & VM_FAULT_ERROR))
-return 0;
 
 if (fault & VM_FAULT_OOM) {
 /* Kernel mode? Handle exceptions or die: */
@@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 up_read(&current->mm->mmap_sem);
 no_context(regs, error_code, address,
 SIGSEGV, SEGV_MAPERR);
-return 1;
+return;
 }
 
 up_read(&current->mm->mmap_sem);
@@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 else
 BUG();
 }
-return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 unsigned long address;
 struct mm_struct *mm;
 int fault;
-int write = error_code & PF_WRITE;
-unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-(write ? FAULT_FLAG_WRITE : 0);
+unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 tsk = current;
 mm = tsk->mm;
@@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 if (user_mode_vm(regs)) {
 local_irq_enable();
 error_code |= PF_USER;
+flags |= FAULT_FLAG_USER;
 } else {
 if (regs->flags & X86_EFLAGS_IF)
 local_irq_enable();
@@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 return;
 }
 
+if (error_code & PF_WRITE)
+flags |= FAULT_FLAG_WRITE;
+
 /*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in
@@ -1187,9 +1180,17 @@ good_area:
 */
 fault = handle_mm_fault(mm, vma, address, flags);
 
-if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-if (mm_fault_error(regs, error_code, address, fault))
-return;
+/*
+* If we need to retry but a fatal signal is pending, handle the
+* signal first. We do not need to release the mmap_sem because it
+* would already be released in __lock_page_or_retry in mm/filemap.c.
+*/
+if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+return;
+
+if (unlikely(fault & VM_FAULT_ERROR)) {
+mm_fault_error(regs, error_code, address, fault);
+return;
 }
 
 /*
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 4b7bc8db170f..70fa7bc42b4a 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
 address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
 #endif
 
+if (user_mode(regs))
+flags |= FAULT_FLAG_USER;
 retry:
 down_read(&mm->mmap_sem);
 vma = find_vma(mm, address);
diff --git a/drivers/base/node.c b/drivers/base/node.c index 7616a77ca322..bc9f43bf7e29 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
125 | nid, K(node_page_state(nid, NR_WRITEBACK)), | 125 | nid, K(node_page_state(nid, NR_WRITEBACK)), |
126 | nid, K(node_page_state(nid, NR_FILE_PAGES)), | 126 | nid, K(node_page_state(nid, NR_FILE_PAGES)), |
127 | nid, K(node_page_state(nid, NR_FILE_MAPPED)), | 127 | nid, K(node_page_state(nid, NR_FILE_MAPPED)), |
128 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
129 | nid, K(node_page_state(nid, NR_ANON_PAGES) | ||
130 | + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * | ||
131 | HPAGE_PMD_NR), | ||
132 | #else | ||
133 | nid, K(node_page_state(nid, NR_ANON_PAGES)), | 128 | nid, K(node_page_state(nid, NR_ANON_PAGES)), |
134 | #endif | ||
135 | nid, K(node_page_state(nid, NR_SHMEM)), | 129 | nid, K(node_page_state(nid, NR_SHMEM)), |
136 | nid, node_page_state(nid, NR_KERNEL_STACK) * | 130 | nid, node_page_state(nid, NR_KERNEL_STACK) * |
137 | THREAD_SIZE / 1024, | 131 | THREAD_SIZE / 1024, |
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5f95d1ed9c6d..b9acadafa4a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c | |||
@@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) | |||
50 | struct inode *inode = mapping->host; | 50 | struct inode *inode = mapping->host; |
51 | 51 | ||
52 | if (to > inode->i_size) | 52 | if (to > inode->i_size) |
53 | truncate_pagecache(inode, to, inode->i_size); | 53 | truncate_pagecache(inode, inode->i_size); |
54 | } | 54 | } |
55 | 55 | ||
56 | static int adfs_write_begin(struct file *file, struct address_space *mapping, | 56 | static int adfs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/affs/file.c b/fs/affs/file.c index 776e3935a758..8669b6ecddee 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c | |||
@@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) | |||
406 | struct inode *inode = mapping->host; | 406 | struct inode *inode = mapping->host; |
407 | 407 | ||
408 | if (to > inode->i_size) { | 408 | if (to > inode->i_size) { |
409 | truncate_pagecache(inode, to, inode->i_size); | 409 | truncate_pagecache(inode, inode->i_size); |
410 | affs_truncate(inode); | 410 | affs_truncate(inode); |
411 | } | 411 | } |
412 | } | 412 | } |
diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ad3ea1497cc3..ae2892218335 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c | |||
@@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) | |||
166 | struct inode *inode = mapping->host; | 166 | struct inode *inode = mapping->host; |
167 | 167 | ||
168 | if (to > inode->i_size) | 168 | if (to > inode->i_size) |
169 | truncate_pagecache(inode, to, inode->i_size); | 169 | truncate_pagecache(inode, inode->i_size); |
170 | } | 170 | } |
171 | 171 | ||
172 | static int bfs_write_begin(struct file *file, struct address_space *mapping, | 172 | static int bfs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef3bea7bb257..3f0ddfce96e6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
221 | struct btrfs_path *path, | 221 | struct btrfs_path *path, |
222 | struct inode *inode) | 222 | struct inode *inode) |
223 | { | 223 | { |
224 | loff_t oldsize; | ||
225 | int ret = 0; | 224 | int ret = 0; |
226 | 225 | ||
227 | oldsize = i_size_read(inode); | ||
228 | btrfs_i_size_write(inode, 0); | 226 | btrfs_i_size_write(inode, 0); |
229 | truncate_pagecache(inode, oldsize, 0); | 227 | truncate_pagecache(inode, 0); |
230 | 228 | ||
231 | /* | 229 | /* |
232 | * We don't need an orphan item because truncating the free space cache | 230 | * We don't need an orphan item because truncating the free space cache |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db1e43948579..f338c5672d58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
4349 | inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); | 4349 | inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); |
4350 | 4350 | ||
4351 | if (newsize > oldsize) { | 4351 | if (newsize > oldsize) { |
4352 | truncate_pagecache(inode, oldsize, newsize); | 4352 | truncate_pagecache(inode, newsize); |
4353 | ret = btrfs_cont_expand(inode, oldsize, newsize); | 4353 | ret = btrfs_cont_expand(inode, oldsize, newsize); |
4354 | if (ret) | 4354 | if (ret) |
4355 | return ret; | 4355 | return ret; |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e3bb6477c83f..f9ff9c173f78 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) | |||
1856 | 1856 | ||
1857 | static void cifs_setsize(struct inode *inode, loff_t offset) | 1857 | static void cifs_setsize(struct inode *inode, loff_t offset) |
1858 | { | 1858 | { |
1859 | loff_t oldsize; | ||
1860 | |||
1861 | spin_lock(&inode->i_lock); | 1859 | spin_lock(&inode->i_lock); |
1862 | oldsize = inode->i_size; | ||
1863 | i_size_write(inode, offset); | 1860 | i_size_write(inode, offset); |
1864 | spin_unlock(&inode->i_lock); | 1861 | spin_unlock(&inode->i_lock); |
1865 | 1862 | ||
1866 | truncate_pagecache(inode, oldsize, offset); | 1863 | truncate_pagecache(inode, offset); |
1867 | } | 1864 | } |
1868 | 1865 | ||
1869 | static int | 1866 | static int |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2ec8eb1ab269..a52a5d23c30b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) | |||
861 | static void _write_failed(struct inode *inode, loff_t to) | 861 | static void _write_failed(struct inode *inode, loff_t to) |
862 | { | 862 | { |
863 | if (to > inode->i_size) | 863 | if (to > inode->i_size) |
864 | truncate_pagecache(inode, to, inode->i_size); | 864 | truncate_pagecache(inode, inode->i_size); |
865 | } | 865 | } |
866 | 866 | ||
867 | int exofs_write_begin(struct file *file, struct address_space *mapping, | 867 | int exofs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0a87bb10998d..c260de6d7b6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to) | |||
58 | struct inode *inode = mapping->host; | 58 | struct inode *inode = mapping->host; |
59 | 59 | ||
60 | if (to > inode->i_size) { | 60 | if (to > inode->i_size) { |
61 | truncate_pagecache(inode, to, inode->i_size); | 61 | truncate_pagecache(inode, inode->i_size); |
62 | ext2_truncate_blocks(inode, inode->i_size); | 62 | ext2_truncate_blocks(inode, inode->i_size); |
63 | } | 63 | } |
64 | } | 64 | } |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c79fd7dabe79..0d424d7ac02b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4587 | 4587 | ||
4588 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { | 4588 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { |
4589 | handle_t *handle; | 4589 | handle_t *handle; |
4590 | loff_t oldsize = inode->i_size; | ||
4591 | 4590 | ||
4592 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { | 4591 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
4593 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 4592 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
@@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4650 | * Truncate pagecache after we've waited for commit | 4649 | * Truncate pagecache after we've waited for commit |
4651 | * in data=journal mode to make pages freeable. | 4650 | * in data=journal mode to make pages freeable. |
4652 | */ | 4651 | */ |
4653 | truncate_pagecache(inode, oldsize, inode->i_size); | 4652 | truncate_pagecache(inode, inode->i_size); |
4654 | } | 4653 | } |
4655 | /* | 4654 | /* |
4656 | * We want to call ext4_truncate() even if attr->ia_size == | 4655 | * We want to call ext4_truncate() even if attr->ia_size == |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 11b51bb55b42..0062da21dd8b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
@@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) | |||
147 | struct inode *inode = mapping->host; | 147 | struct inode *inode = mapping->host; |
148 | 148 | ||
149 | if (to > inode->i_size) { | 149 | if (to > inode->i_size) { |
150 | truncate_pagecache(inode, to, inode->i_size); | 150 | truncate_pagecache(inode, inode->i_size); |
151 | fat_truncate_blocks(inode, inode->i_size); | 151 | fat_truncate_blocks(inode, inode->i_size); |
152 | } | 152 | } |
153 | } | 153 | } |
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3ac91086f41f..62b43b577bfc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
@@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, | |||
1678 | * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. | 1678 | * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. |
1679 | */ | 1679 | */ |
1680 | if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { | 1680 | if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { |
1681 | truncate_pagecache(inode, oldsize, outarg.attr.size); | 1681 | truncate_pagecache(inode, outarg.attr.size); |
1682 | invalidate_inode_pages2(inode->i_mapping); | 1682 | invalidate_inode_pages2(inode->i_mapping); |
1683 | } | 1683 | } |
1684 | 1684 | ||
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84434594e80e..a8ce6dab60a0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, | |||
218 | bool inval = false; | 218 | bool inval = false; |
219 | 219 | ||
220 | if (oldsize != attr->size) { | 220 | if (oldsize != attr->size) { |
221 | truncate_pagecache(inode, oldsize, attr->size); | 221 | truncate_pagecache(inode, attr->size); |
222 | inval = true; | 222 | inval = true; |
223 | } else if (fc->auto_inval_data) { | 223 | } else if (fc->auto_inval_data) { |
224 | struct timespec new_mtime = { | 224 | struct timespec new_mtime = { |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e2f56fccf6b..62a65fc448dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize | |||
1016 | chunk = oldsize - newsize; | 1016 | chunk = oldsize - newsize; |
1017 | if (chunk > max_chunk) | 1017 | if (chunk > max_chunk) |
1018 | chunk = max_chunk; | 1018 | chunk = max_chunk; |
1019 | truncate_pagecache(inode, oldsize, oldsize - chunk); | 1019 | truncate_pagecache(inode, oldsize - chunk); |
1020 | oldsize -= chunk; | 1020 | oldsize -= chunk; |
1021 | gfs2_trans_end(sdp); | 1021 | gfs2_trans_end(sdp); |
1022 | error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); | 1022 | error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); |
@@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | |||
1067 | if (journaled) | 1067 | if (journaled) |
1068 | error = gfs2_journaled_truncate(inode, oldsize, newsize); | 1068 | error = gfs2_journaled_truncate(inode, oldsize, newsize); |
1069 | else | 1069 | else |
1070 | truncate_pagecache(inode, oldsize, newsize); | 1070 | truncate_pagecache(inode, newsize); |
1071 | 1071 | ||
1072 | if (error) { | 1072 | if (error) { |
1073 | brelse(dibh); | 1073 | brelse(dibh); |
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f9299d8a64e3..380ab31b5e0f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c | |||
@@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) | |||
41 | struct inode *inode = mapping->host; | 41 | struct inode *inode = mapping->host; |
42 | 42 | ||
43 | if (to > inode->i_size) { | 43 | if (to > inode->i_size) { |
44 | truncate_pagecache(inode, to, inode->i_size); | 44 | truncate_pagecache(inode, inode->i_size); |
45 | hfs_file_truncate(inode); | 45 | hfs_file_truncate(inode); |
46 | } | 46 | } |
47 | } | 47 | } |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4d2edaea891c..37213d075f3c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
@@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) | |||
36 | struct inode *inode = mapping->host; | 36 | struct inode *inode = mapping->host; |
37 | 37 | ||
38 | if (to > inode->i_size) { | 38 | if (to > inode->i_size) { |
39 | truncate_pagecache(inode, to, inode->i_size); | 39 | truncate_pagecache(inode, inode->i_size); |
40 | hfsplus_file_truncate(inode); | 40 | hfsplus_file_truncate(inode); |
41 | } | 41 | } |
42 | } | 42 | } |
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 4e9dabcf1f4c..67c1a61e0955 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c | |||
@@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) | |||
138 | hpfs_lock(inode->i_sb); | 138 | hpfs_lock(inode->i_sb); |
139 | 139 | ||
140 | if (to > inode->i_size) { | 140 | if (to > inode->i_size) { |
141 | truncate_pagecache(inode, to, inode->i_size); | 141 | truncate_pagecache(inode, inode->i_size); |
142 | hpfs_truncate(inode); | 142 | hpfs_truncate(inode); |
143 | } | 143 | } |
144 | 144 | ||
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 730f24e282a6..f4aab719add5 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c | |||
@@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) | |||
306 | struct inode *inode = mapping->host; | 306 | struct inode *inode = mapping->host; |
307 | 307 | ||
308 | if (to > inode->i_size) { | 308 | if (to > inode->i_size) { |
309 | truncate_pagecache(inode, to, inode->i_size); | 309 | truncate_pagecache(inode, inode->i_size); |
310 | jfs_truncate(inode); | 310 | jfs_truncate(inode); |
311 | } | 311 | } |
312 | } | 312 | } |
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df122496f328..0332109162a5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
@@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) | |||
400 | struct inode *inode = mapping->host; | 400 | struct inode *inode = mapping->host; |
401 | 401 | ||
402 | if (to > inode->i_size) { | 402 | if (to > inode->i_size) { |
403 | truncate_pagecache(inode, to, inode->i_size); | 403 | truncate_pagecache(inode, inode->i_size); |
404 | minix_truncate(inode); | 404 | minix_truncate(inode); |
405 | } | 405 | } |
406 | } | 406 | } |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 87e797640828..eda8879171c4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr); | |||
541 | */ | 541 | */ |
542 | static int nfs_vmtruncate(struct inode * inode, loff_t offset) | 542 | static int nfs_vmtruncate(struct inode * inode, loff_t offset) |
543 | { | 543 | { |
544 | loff_t oldsize; | ||
545 | int err; | 544 | int err; |
546 | 545 | ||
547 | err = inode_newsize_ok(inode, offset); | 546 | err = inode_newsize_ok(inode, offset); |
@@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) | |||
549 | goto out; | 548 | goto out; |
550 | 549 | ||
551 | spin_lock(&inode->i_lock); | 550 | spin_lock(&inode->i_lock); |
552 | oldsize = inode->i_size; | ||
553 | i_size_write(inode, offset); | 551 | i_size_write(inode, offset); |
554 | spin_unlock(&inode->i_lock); | 552 | spin_unlock(&inode->i_lock); |
555 | 553 | ||
556 | truncate_pagecache(inode, oldsize, offset); | 554 | truncate_pagecache(inode, offset); |
557 | out: | 555 | out: |
558 | return err; | 556 | return err; |
559 | } | 557 | } |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b1a5277cfd18..7e350c562e0e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
@@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) | |||
254 | struct inode *inode = mapping->host; | 254 | struct inode *inode = mapping->host; |
255 | 255 | ||
256 | if (to > inode->i_size) { | 256 | if (to > inode->i_size) { |
257 | truncate_pagecache(inode, to, inode->i_size); | 257 | truncate_pagecache(inode, inode->i_size); |
258 | nilfs_truncate(inode); | 258 | nilfs_truncate(inode); |
259 | } | 259 | } |
260 | } | 260 | } |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c5670b8d198c..ea4ba9daeb47 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to) | |||
1768 | struct inode *inode = mapping->host; | 1768 | struct inode *inode = mapping->host; |
1769 | 1769 | ||
1770 | if (to > inode->i_size) { | 1770 | if (to > inode->i_size) { |
1771 | truncate_pagecache(inode, to, inode->i_size); | 1771 | truncate_pagecache(inode, inode->i_size); |
1772 | ntfs_truncate_vfs(inode); | 1772 | ntfs_truncate_vfs(inode); |
1773 | } | 1773 | } |
1774 | } | 1774 | } |
diff --git a/fs/omfs/file.c b/fs/omfs/file.c index e0d9b3e722bd..54d57d6ba68d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c | |||
@@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) | |||
311 | struct inode *inode = mapping->host; | 311 | struct inode *inode = mapping->host; |
312 | 312 | ||
313 | if (to > inode->i_size) { | 313 | if (to > inode->i_size) { |
314 | truncate_pagecache(inode, to, inode->i_size); | 314 | truncate_pagecache(inode, inode->i_size); |
315 | omfs_truncate(inode); | 315 | omfs_truncate(inode); |
316 | } | 316 | } |
317 | } | 317 | } |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5aa847a603c0..59d85d608898 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
132 | K(i.freeswap), | 132 | K(i.freeswap), |
133 | K(global_page_state(NR_FILE_DIRTY)), | 133 | K(global_page_state(NR_FILE_DIRTY)), |
134 | K(global_page_state(NR_WRITEBACK)), | 134 | K(global_page_state(NR_WRITEBACK)), |
135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
136 | K(global_page_state(NR_ANON_PAGES) | ||
137 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | ||
138 | HPAGE_PMD_NR), | ||
139 | #else | ||
140 | K(global_page_state(NR_ANON_PAGES)), | 135 | K(global_page_state(NR_ANON_PAGES)), |
141 | #endif | ||
142 | K(global_page_state(NR_FILE_MAPPED)), | 136 | K(global_page_state(NR_FILE_MAPPED)), |
143 | K(global_page_state(NR_SHMEM)), | 137 | K(global_page_state(NR_SHMEM)), |
144 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 138 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index c1a591a4725b..66bc316927e8 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c | |||
@@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) | |||
469 | struct inode *inode = mapping->host; | 469 | struct inode *inode = mapping->host; |
470 | 470 | ||
471 | if (to > inode->i_size) { | 471 | if (to > inode->i_size) { |
472 | truncate_pagecache(inode, to, inode->i_size); | 472 | truncate_pagecache(inode, inode->i_size); |
473 | sysv_truncate(inode); | 473 | sysv_truncate(inode); |
474 | } | 474 | } |
475 | } | 475 | } |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b6d15d349810..062b7925bca0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
@@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) | |||
172 | loff_t isize = inode->i_size; | 172 | loff_t isize = inode->i_size; |
173 | 173 | ||
174 | if (to > isize) { | 174 | if (to > isize) { |
175 | truncate_pagecache(inode, to, isize); | 175 | truncate_pagecache(inode, isize); |
176 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { | 176 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { |
177 | down_write(&iinfo->i_data_sem); | 177 | down_write(&iinfo->i_data_sem); |
178 | udf_clear_extent_cache(inode); | 178 | udf_clear_extent_cache(inode); |
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ff24e4449ece..c8ca96086784 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c | |||
@@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) | |||
531 | struct inode *inode = mapping->host; | 531 | struct inode *inode = mapping->host; |
532 | 532 | ||
533 | if (to > inode->i_size) | 533 | if (to > inode->i_size) |
534 | truncate_pagecache(inode, to, inode->i_size); | 534 | truncate_pagecache(inode, inode->i_size); |
535 | } | 535 | } |
536 | 536 | ||
537 | static int ufs_write_begin(struct file *file, struct address_space *mapping, | 537 | static int ufs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 977da0ec6604..e51e581454e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -1582,7 +1582,7 @@ xfs_vm_write_begin( | |||
1582 | unlock_page(page); | 1582 | unlock_page(page); |
1583 | 1583 | ||
1584 | if (pos + len > i_size_read(inode)) | 1584 | if (pos + len > i_size_read(inode)) |
1585 | truncate_pagecache(inode, pos + len, i_size_read(inode)); | 1585 | truncate_pagecache(inode, i_size_read(inode)); |
1586 | 1586 | ||
1587 | page_cache_release(page); | 1587 | page_cache_release(page); |
1588 | page = NULL; | 1588 | page = NULL; |
@@ -1618,7 +1618,7 @@ xfs_vm_write_end( | |||
1618 | loff_t to = pos + len; | 1618 | loff_t to = pos + len; |
1619 | 1619 | ||
1620 | if (to > isize) { | 1620 | if (to > isize) { |
1621 | truncate_pagecache(inode, to, isize); | 1621 | truncate_pagecache(inode, isize); |
1622 | xfs_vm_kill_delalloc_range(inode, isize, to); | 1622 | xfs_vm_kill_delalloc_range(inode, isize, to); |
1623 | } | 1623 | } |
1624 | } | 1624 | } |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b60de92e2edc..3935428c57cf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
96 | pmd_t *dst_pmd, pmd_t *src_pmd, | 96 | pmd_t *dst_pmd, pmd_t *src_pmd, |
97 | struct vm_area_struct *vma, | 97 | struct vm_area_struct *vma, |
98 | unsigned long addr, unsigned long end); | 98 | unsigned long addr, unsigned long end); |
99 | extern int handle_pte_fault(struct mm_struct *mm, | ||
100 | struct vm_area_struct *vma, unsigned long address, | ||
101 | pte_t *pte, pmd_t *pmd, unsigned int flags); | ||
102 | extern int split_huge_page_to_list(struct page *page, struct list_head *list); | 99 | extern int split_huge_page_to_list(struct page *page, struct list_head *list); |
103 | static inline int split_huge_page(struct page *page) | 100 | static inline int split_huge_page(struct page *page) |
104 | { | 101 | { |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c416092e324..60e95872da29 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -30,9 +30,21 @@ struct page; | |||
30 | struct mm_struct; | 30 | struct mm_struct; |
31 | struct kmem_cache; | 31 | struct kmem_cache; |
32 | 32 | ||
33 | /* Stats that can be updated by kernel. */ | 33 | /* |
34 | enum mem_cgroup_page_stat_item { | 34 | * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, |
35 | MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ | 35 | * These two lists should keep in accord with each other. |
36 | */ | ||
37 | enum mem_cgroup_stat_index { | ||
38 | /* | ||
39 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
40 | */ | ||
41 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
42 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | ||
43 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ | ||
44 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | ||
45 | MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ | ||
46 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
47 | MEM_CGROUP_STAT_NSTATS, | ||
36 | }; | 48 | }; |
37 | 49 | ||
38 | struct mem_cgroup_reclaim_cookie { | 50 | struct mem_cgroup_reclaim_cookie { |
@@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie { | |||
41 | unsigned int generation; | 53 | unsigned int generation; |
42 | }; | 54 | }; |
43 | 55 | ||
56 | enum mem_cgroup_filter_t { | ||
57 | VISIT, /* visit current node */ | ||
58 | SKIP, /* skip the current node and continue traversal */ | ||
59 | SKIP_TREE, /* skip the whole subtree and continue traversal */ | ||
60 | }; | ||
61 | |||
62 | /* | ||
63 | * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to | ||
64 | * iterate through the hierarchy tree. Each tree element is checked by the | ||
65 | * predicate before it is returned by the iterator. If a filter returns | ||
66 | * SKIP or SKIP_TREE then the iterator code continues traversal (with the | ||
67 | * next node down the hierarchy or the next node that doesn't belong under the | ||
68 | * memcg's subtree). | ||
69 | */ | ||
70 | typedef enum mem_cgroup_filter_t | ||
71 | (*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); | ||
72 | |||
44 | #ifdef CONFIG_MEMCG | 73 | #ifdef CONFIG_MEMCG |
45 | /* | 74 | /* |
46 | * All "charge" functions with gfp_mask should use GFP_KERNEL or | 75 | * All "charge" functions with gfp_mask should use GFP_KERNEL or |
@@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
108 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 137 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
109 | struct page *oldpage, struct page *newpage, bool migration_ok); | 138 | struct page *oldpage, struct page *newpage, bool migration_ok); |
110 | 139 | ||
111 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, | 140 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, |
112 | struct mem_cgroup *, | 141 | struct mem_cgroup *prev, |
113 | struct mem_cgroup_reclaim_cookie *); | 142 | struct mem_cgroup_reclaim_cookie *reclaim, |
143 | mem_cgroup_iter_filter cond); | ||
144 | |||
145 | static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | ||
146 | struct mem_cgroup *prev, | ||
147 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
148 | { | ||
149 | return mem_cgroup_iter_cond(root, prev, reclaim, NULL); | ||
150 | } | ||
151 | |||
114 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | 152 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); |
115 | 153 | ||
116 | /* | 154 | /* |
@@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | |||
125 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, | 163 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, |
126 | struct page *newpage); | 164 | struct page *newpage); |
127 | 165 | ||
166 | /** | ||
167 | * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task | ||
168 | * @new: true to enable, false to disable | ||
169 | * | ||
170 | * Toggle whether a failed memcg charge should invoke the OOM killer | ||
171 | * or just return -ENOMEM. Returns the previous toggle state. | ||
172 | * | ||
173 | * NOTE: Any path that enables the OOM killer before charging must | ||
174 | * call mem_cgroup_oom_synchronize() afterward to finalize the | ||
175 | * OOM handling and clean up. | ||
176 | */ | ||
177 | static inline bool mem_cgroup_toggle_oom(bool new) | ||
178 | { | ||
179 | bool old; | ||
180 | |||
181 | old = current->memcg_oom.may_oom; | ||
182 | current->memcg_oom.may_oom = new; | ||
183 | |||
184 | return old; | ||
185 | } | ||
186 | |||
187 | static inline void mem_cgroup_enable_oom(void) | ||
188 | { | ||
189 | bool old = mem_cgroup_toggle_oom(true); | ||
190 | |||
191 | WARN_ON(old == true); | ||
192 | } | ||
193 | |||
194 | static inline void mem_cgroup_disable_oom(void) | ||
195 | { | ||
196 | bool old = mem_cgroup_toggle_oom(false); | ||
197 | |||
198 | WARN_ON(old == false); | ||
199 | } | ||
200 | |||
201 | static inline bool task_in_memcg_oom(struct task_struct *p) | ||
202 | { | ||
203 | return p->memcg_oom.in_memcg_oom; | ||
204 | } | ||
205 | |||
206 | bool mem_cgroup_oom_synchronize(void); | ||
207 | |||
128 | #ifdef CONFIG_MEMCG_SWAP | 208 | #ifdef CONFIG_MEMCG_SWAP |
129 | extern int do_swap_account; | 209 | extern int do_swap_account; |
130 | #endif | 210 | #endif |
@@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, | |||
165 | } | 245 | } |
166 | 246 | ||
167 | void mem_cgroup_update_page_stat(struct page *page, | 247 | void mem_cgroup_update_page_stat(struct page *page, |
168 | enum mem_cgroup_page_stat_item idx, | 248 | enum mem_cgroup_stat_index idx, |
169 | int val); | 249 | int val); |
170 | 250 | ||
171 | static inline void mem_cgroup_inc_page_stat(struct page *page, | 251 | static inline void mem_cgroup_inc_page_stat(struct page *page, |
172 | enum mem_cgroup_page_stat_item idx) | 252 | enum mem_cgroup_stat_index idx) |
173 | { | 253 | { |
174 | mem_cgroup_update_page_stat(page, idx, 1); | 254 | mem_cgroup_update_page_stat(page, idx, 1); |
175 | } | 255 | } |
176 | 256 | ||
177 | static inline void mem_cgroup_dec_page_stat(struct page *page, | 257 | static inline void mem_cgroup_dec_page_stat(struct page *page, |
178 | enum mem_cgroup_page_stat_item idx) | 258 | enum mem_cgroup_stat_index idx) |
179 | { | 259 | { |
180 | mem_cgroup_update_page_stat(page, idx, -1); | 260 | mem_cgroup_update_page_stat(page, idx, -1); |
181 | } | 261 | } |
182 | 262 | ||
183 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 263 | enum mem_cgroup_filter_t |
184 | gfp_t gfp_mask, | 264 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
185 | unsigned long *total_scanned); | 265 | struct mem_cgroup *root); |
186 | 266 | ||
187 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 267 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
188 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, | 268 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, |
@@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
296 | struct page *oldpage, struct page *newpage, bool migration_ok) | 376 | struct page *oldpage, struct page *newpage, bool migration_ok) |
297 | { | 377 | { |
298 | } | 378 | } |
379 | static inline struct mem_cgroup * | ||
380 | mem_cgroup_iter_cond(struct mem_cgroup *root, | ||
381 | struct mem_cgroup *prev, | ||
382 | struct mem_cgroup_reclaim_cookie *reclaim, | ||
383 | mem_cgroup_iter_filter cond) | ||
384 | { | ||
385 | /* first call must return non-NULL, second return NULL */ | ||
386 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
387 | } | ||
299 | 388 | ||
300 | static inline struct mem_cgroup * | 389 | static inline struct mem_cgroup * |
301 | mem_cgroup_iter(struct mem_cgroup *root, | 390 | mem_cgroup_iter(struct mem_cgroup *root, |
@@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, | |||
348 | { | 437 | { |
349 | } | 438 | } |
350 | 439 | ||
440 | static inline bool mem_cgroup_toggle_oom(bool new) | ||
441 | { | ||
442 | return false; | ||
443 | } | ||
444 | |||
445 | static inline void mem_cgroup_enable_oom(void) | ||
446 | { | ||
447 | } | ||
448 | |||
449 | static inline void mem_cgroup_disable_oom(void) | ||
450 | { | ||
451 | } | ||
452 | |||
453 | static inline bool task_in_memcg_oom(struct task_struct *p) | ||
454 | { | ||
455 | return false; | ||
456 | } | ||
457 | |||
458 | static inline bool mem_cgroup_oom_synchronize(void) | ||
459 | { | ||
460 | return false; | ||
461 | } | ||
462 | |||
351 | static inline void mem_cgroup_inc_page_stat(struct page *page, | 463 | static inline void mem_cgroup_inc_page_stat(struct page *page, |
352 | enum mem_cgroup_page_stat_item idx) | 464 | enum mem_cgroup_stat_index idx) |
353 | { | 465 | { |
354 | } | 466 | } |
355 | 467 | ||
356 | static inline void mem_cgroup_dec_page_stat(struct page *page, | 468 | static inline void mem_cgroup_dec_page_stat(struct page *page, |
357 | enum mem_cgroup_page_stat_item idx) | 469 | enum mem_cgroup_stat_index idx) |
358 | { | 470 | { |
359 | } | 471 | } |
360 | 472 | ||
361 | static inline | 473 | static inline |
362 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 474 | enum mem_cgroup_filter_t |
363 | gfp_t gfp_mask, | 475 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
364 | unsigned long *total_scanned) | 476 | struct mem_cgroup *root) |
365 | { | 477 | { |
366 | return 0; | 478 | return VISIT; |
367 | } | 479 | } |
368 | 480 | ||
369 | static inline void mem_cgroup_split_huge_fixup(struct page *head) | 481 | static inline void mem_cgroup_split_huge_fixup(struct page *head) |
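
A note on the helpers above: mem_cgroup_toggle_oom() follows a plain save-and-restore pattern, recording the task's previous may_oom setting, overriding it for a bounded section, and handing the old value back so the caller can restore it. The sketch below is a minimal userspace illustration of that pattern only; the thread-local flag and do_optional_work() are invented for the example and are not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-task may_oom bit from struct memcg_oom_info. */
static _Thread_local bool may_oom = true;

/* Same idiom as mem_cgroup_toggle_oom(): set the new state, return the old. */
static bool toggle_oom(bool new_state)
{
        bool old = may_oom;

        may_oom = new_state;
        return old;
}

static void do_optional_work(void)
{
        /* Work whose allocation failures should not escalate to an OOM kill. */
        printf("optional work, may_oom=%d\n", may_oom);
}

int main(void)
{
        bool old = toggle_oom(false);   /* disable for the optional section */

        do_optional_work();
        toggle_oom(old);                /* restore the caller's setting */
        printf("restored, may_oom=%d\n", may_oom);
        return 0;
}
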
diff --git a/include/linux/mm.h b/include/linux/mm.h index caf543c7eaa7..8b6e55ee8855 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -176,6 +176,7 @@ extern pgprot_t protection_map[16]; | |||
176 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ | 176 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ |
177 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ | 177 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ |
178 | #define FAULT_FLAG_TRIED 0x40 /* second try */ | 178 | #define FAULT_FLAG_TRIED 0x40 /* second try */ |
179 | #define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ | ||
179 | 180 | ||
180 | /* | 181 | /* |
181 | * vm_fault is filled by the the pagefault handler and passed to the vma's | 182 | * vm_fault is filled by the the pagefault handler and passed to the vma's |
@@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page) | |||
876 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ | 877 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
877 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 878 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
878 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ | 879 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ |
880 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ | ||
879 | 881 | ||
880 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ | 882 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
881 | 883 | ||
882 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ | 884 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ |
883 | VM_FAULT_HWPOISON_LARGE) | 885 | VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) |
884 | 886 | ||
885 | /* Encode hstate index for a hwpoisoned large page */ | 887 | /* Encode hstate index for a hwpoisoned large page */ |
886 | #define VM_FAULT_SET_HINDEX(x) ((x) << 12) | 888 | #define VM_FAULT_SET_HINDEX(x) ((x) << 12) |
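
Since VM_FAULT_FALLBACK is now included in VM_FAULT_ERROR, a caller that checks fault & VM_FAULT_ERROR without handling VM_FAULT_FALLBACK explicitly treats a failed huge page fault as an error result. The standalone sketch below only demonstrates that mask test; the 0x0800 value of VM_FAULT_FALLBACK comes from the hunk above, while the other flag values are quoted from include/linux/mm.h and are not part of this hunk. It compiles on its own and is not kernel code.

#include <stdio.h>

#define VM_FAULT_OOM            0x0001
#define VM_FAULT_SIGBUS         0x0002
#define VM_FAULT_HWPOISON       0x0010
#define VM_FAULT_HWPOISON_LARGE 0x0020
#define VM_FAULT_FALLBACK       0x0800  /* huge page fault failed, fall back to small */

#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
                        VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)

int main(void)
{
        unsigned int fault = VM_FAULT_FALLBACK;

        /* Without special-casing FALLBACK first, the error path is taken. */
        if (fault & VM_FAULT_ERROR)
                printf("error path taken, fault=0x%x\n", fault);
        return 0;
}
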
@@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, | |||
984 | unmap_mapping_range(mapping, holebegin, holelen, 0); | 986 | unmap_mapping_range(mapping, holebegin, holelen, 0); |
985 | } | 987 | } |
986 | 988 | ||
987 | extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); | 989 | extern void truncate_pagecache(struct inode *inode, loff_t new); |
988 | extern void truncate_setsize(struct inode *inode, loff_t newsize); | 990 | extern void truncate_setsize(struct inode *inode, loff_t newsize); |
989 | void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); | 991 | void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); |
990 | int truncate_inode_page(struct address_space *mapping, struct page *page); | 992 | int truncate_inode_page(struct address_space *mapping, struct page *page); |
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 96a509b6be04..201a69749659 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h | |||
@@ -54,7 +54,7 @@ struct res_counter { | |||
54 | struct res_counter *parent; | 54 | struct res_counter *parent; |
55 | }; | 55 | }; |
56 | 56 | ||
57 | #define RESOURCE_MAX (unsigned long long)LLONG_MAX | 57 | #define RES_COUNTER_MAX ULLONG_MAX |
58 | 58 | ||
59 | /** | 59 | /** |
60 | * Helpers to interact with userspace | 60 | * Helpers to interact with userspace |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 45f254dddafc..6682da36b293 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1393,6 +1393,13 @@ struct task_struct { | |||
1393 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ | 1393 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ |
1394 | } memcg_batch; | 1394 | } memcg_batch; |
1395 | unsigned int memcg_kmem_skip_account; | 1395 | unsigned int memcg_kmem_skip_account; |
1396 | struct memcg_oom_info { | ||
1397 | unsigned int may_oom:1; | ||
1398 | unsigned int in_memcg_oom:1; | ||
1399 | unsigned int oom_locked:1; | ||
1400 | int wakeups; | ||
1401 | struct mem_cgroup *wait_on_memcg; | ||
1402 | } memcg_oom; | ||
1396 | #endif | 1403 | #endif |
1397 | #ifdef CONFIG_UPROBES | 1404 | #ifdef CONFIG_UPROBES |
1398 | struct uprobe_task *utask; | 1405 | struct uprobe_task *utask; |
diff --git a/include/linux/swap.h b/include/linux/swap.h index c03c139219c9..46ba0c6c219f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -280,7 +280,7 @@ extern void activate_page(struct page *); | |||
280 | extern void mark_page_accessed(struct page *); | 280 | extern void mark_page_accessed(struct page *); |
281 | extern void lru_add_drain(void); | 281 | extern void lru_add_drain(void); |
282 | extern void lru_add_drain_cpu(int cpu); | 282 | extern void lru_add_drain_cpu(int cpu); |
283 | extern int lru_add_drain_all(void); | 283 | extern void lru_add_drain_all(void); |
284 | extern void rotate_reclaimable_page(struct page *page); | 284 | extern void rotate_reclaimable_page(struct page *page); |
285 | extern void deactivate_page(struct page *page); | 285 | extern void deactivate_page(struct page *page); |
286 | extern void swap_setup(void); | 286 | extern void swap_setup(void); |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) | |||
74 | { | 74 | { |
75 | unsigned long val; | 75 | unsigned long val; |
76 | 76 | ||
77 | if (strict_strtoul(str, 0, &val)) { | 77 | if (kstrtoul(str, 0, &val)) { |
78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); | 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); |
79 | return 0; | 79 | return 0; |
80 | } | 80 | } |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, | |||
113 | unsigned long cnt; | 113 | unsigned long cnt; |
114 | int ret; | 114 | int ret; |
115 | 115 | ||
116 | if (strict_strtoul(buf, 0, &cnt)) | 116 | if (kstrtoul(buf, 0, &cnt)) |
117 | return -EINVAL; | 117 | return -EINVAL; |
118 | 118 | ||
119 | ret = crash_shrink_memory(cnt); | 119 | ret = crash_shrink_memory(cnt); |
diff --git a/kernel/params.c b/kernel/params.c index 501bde4f3bee..81c4e78c8f4c 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -253,13 +253,13 @@ int parse_args(const char *doing, | |||
253 | EXPORT_SYMBOL(param_ops_##name) | 253 | EXPORT_SYMBOL(param_ops_##name) |
254 | 254 | ||
255 | 255 | ||
256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); | 256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); |
257 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 257 | STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); |
258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); | 258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); |
259 | STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); | 259 | STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); |
260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | 260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); |
261 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 261 | STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); |
262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); |
263 | 263 | ||
264 | int param_set_charp(const char *val, const struct kernel_param *kp) | 264 | int param_set_charp(const char *val, const struct kernel_param *kp) |
265 | { | 265 | { |
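
The conversions in this file and the two above swap strict_strto*() for kstrto*(), which keep the same strict contract: the whole string must be a valid number (one trailing newline is tolerated) and out-of-range values fail, unlike plain strtoul(). Below is a rough userspace analogue of that contract for illustration; parse_ulong_strict() is an invented name, not a kernel or libc function, and it does not reproduce every detail of kstrtoul().

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch of kstrtoul()-style strictness on top of strtoul(). */
static int parse_ulong_strict(const char *s, unsigned int base, unsigned long *res)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (end == s)
                return -EINVAL;         /* no digits at all */
        if (errno == ERANGE)
                return -ERANGE;         /* value out of range */
        if (*end == '\n')
                end++;                  /* tolerate a single trailing newline */
        if (*end != '\0')
                return -EINVAL;         /* trailing garbage */
        *res = val;
        return 0;
}

int main(void)
{
        unsigned long val = 0;

        printf("%d\n", parse_ulong_strict("42\n", 0, &val));   /* 0, val == 42 */
        printf("%d\n", parse_ulong_strict("42abc", 0, &val));  /* -EINVAL */
        return 0;
}
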
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -17,8 +17,8 @@ | |||
17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) |
18 | { | 18 | { |
19 | spin_lock_init(&counter->lock); | 19 | spin_lock_init(&counter->lock); |
20 | counter->limit = RESOURCE_MAX; | 20 | counter->limit = RES_COUNTER_MAX; |
21 | counter->soft_limit = RESOURCE_MAX; | 21 | counter->soft_limit = RES_COUNTER_MAX; |
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) | |||
178 | #endif | 178 | #endif |
179 | 179 | ||
180 | int res_counter_memparse_write_strategy(const char *buf, | 180 | int res_counter_memparse_write_strategy(const char *buf, |
181 | unsigned long long *res) | 181 | unsigned long long *resp) |
182 | { | 182 | { |
183 | char *end; | 183 | char *end; |
184 | unsigned long long res; | ||
184 | 185 | ||
185 | /* return RESOURCE_MAX(unlimited) if "-1" is specified */ | 186 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
186 | if (*buf == '-') { | 187 | if (*buf == '-') { |
187 | *res = simple_strtoull(buf + 1, &end, 10); | 188 | res = simple_strtoull(buf + 1, &end, 10); |
188 | if (*res != 1 || *end != '\0') | 189 | if (res != 1 || *end != '\0') |
189 | return -EINVAL; | 190 | return -EINVAL; |
190 | *res = RESOURCE_MAX; | 191 | *resp = RES_COUNTER_MAX; |
191 | return 0; | 192 | return 0; |
192 | } | 193 | } |
193 | 194 | ||
194 | *res = memparse(buf, &end); | 195 | res = memparse(buf, &end); |
195 | if (*end != '\0') | 196 | if (*end != '\0') |
196 | return -EINVAL; | 197 | return -EINVAL; |
197 | 198 | ||
198 | *res = PAGE_ALIGN(*res); | 199 | if (PAGE_ALIGN(res) >= res) |
200 | res = PAGE_ALIGN(res); | ||
201 | else | ||
202 | res = RES_COUNTER_MAX; | ||
203 | |||
204 | *resp = res; | ||
205 | |||
199 | return 0; | 206 | return 0; |
200 | } | 207 | } |
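
The PAGE_ALIGN(res) >= res check added above guards against wraparound: rounding a value near ULLONG_MAX up to a page boundary overflows to a small number, which would quietly turn an effectively unlimited request into a tiny limit. The demo below reproduces the effect outside the kernel; DEMO_PAGE_SIZE, DEMO_PAGE_ALIGN and DEMO_RES_COUNTER_MAX are local stand-ins assumed to mirror the kernel definitions.

#include <stdio.h>

#define DEMO_PAGE_SIZE        4096ULL
#define DEMO_PAGE_ALIGN(x)    (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))
#define DEMO_RES_COUNTER_MAX  (~0ULL)   /* ULLONG_MAX */

int main(void)
{
        unsigned long long res = ~0ULL - 100;   /* a limit just below ULLONG_MAX */

        /* Unchecked alignment wraps around to a small value. */
        printf("naive PAGE_ALIGN: %llu\n", DEMO_PAGE_ALIGN(res));

        /* The guarded version clamps to the counter maximum instead. */
        if (DEMO_PAGE_ALIGN(res) >= res)
                res = DEMO_PAGE_ALIGN(res);
        else
                res = DEMO_RES_COUNTER_MAX;
        printf("guarded result: %llu\n", res);
        return 0;
}
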
diff --git a/mm/Kconfig b/mm/Kconfig index 6cdd27043303..026771a9b097 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -245,7 +245,7 @@ config COMPACTION | |||
245 | config MIGRATION | 245 | config MIGRATION |
246 | bool "Page migration" | 246 | bool "Page migration" |
247 | def_bool y | 247 | def_bool y |
248 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA | 248 | depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU |
249 | help | 249 | help |
250 | Allows the migration of the physical location of pages of processes | 250 | Allows the migration of the physical location of pages of processes |
251 | while the virtual addresses are not changed. This is useful in | 251 | while the virtual addresses are not changed. This is useful in |
@@ -480,7 +480,7 @@ config FRONTSWAP | |||
480 | 480 | ||
481 | config CMA | 481 | config CMA |
482 | bool "Contiguous Memory Allocator" | 482 | bool "Contiguous Memory Allocator" |
483 | depends on HAVE_MEMBLOCK | 483 | depends on HAVE_MEMBLOCK && MMU |
484 | select MIGRATION | 484 | select MIGRATION |
485 | select MEMORY_ISOLATION | 485 | select MEMORY_ISOLATION |
486 | help | 486 | help |
diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a8..1e6aec4a2d2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
467 | error = mem_cgroup_cache_charge(page, current->mm, | 467 | error = mem_cgroup_cache_charge(page, current->mm, |
468 | gfp_mask & GFP_RECLAIM_MASK); | 468 | gfp_mask & GFP_RECLAIM_MASK); |
469 | if (error) | 469 | if (error) |
470 | goto out; | 470 | return error; |
471 | 471 | ||
472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); | 472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); |
473 | if (error == 0) { | 473 | if (error) { |
474 | page_cache_get(page); | ||
475 | page->mapping = mapping; | ||
476 | page->index = offset; | ||
477 | |||
478 | spin_lock_irq(&mapping->tree_lock); | ||
479 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
480 | if (likely(!error)) { | ||
481 | mapping->nrpages++; | ||
482 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
483 | spin_unlock_irq(&mapping->tree_lock); | ||
484 | trace_mm_filemap_add_to_page_cache(page); | ||
485 | } else { | ||
486 | page->mapping = NULL; | ||
487 | /* Leave page->index set: truncation relies upon it */ | ||
488 | spin_unlock_irq(&mapping->tree_lock); | ||
489 | mem_cgroup_uncharge_cache_page(page); | ||
490 | page_cache_release(page); | ||
491 | } | ||
492 | radix_tree_preload_end(); | ||
493 | } else | ||
494 | mem_cgroup_uncharge_cache_page(page); | 474 | mem_cgroup_uncharge_cache_page(page); |
495 | out: | 475 | return error; |
476 | } | ||
477 | |||
478 | page_cache_get(page); | ||
479 | page->mapping = mapping; | ||
480 | page->index = offset; | ||
481 | |||
482 | spin_lock_irq(&mapping->tree_lock); | ||
483 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
484 | radix_tree_preload_end(); | ||
485 | if (unlikely(error)) | ||
486 | goto err_insert; | ||
487 | mapping->nrpages++; | ||
488 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
489 | spin_unlock_irq(&mapping->tree_lock); | ||
490 | trace_mm_filemap_add_to_page_cache(page); | ||
491 | return 0; | ||
492 | err_insert: | ||
493 | page->mapping = NULL; | ||
494 | /* Leave page->index set: truncation relies upon it */ | ||
495 | spin_unlock_irq(&mapping->tree_lock); | ||
496 | mem_cgroup_uncharge_cache_page(page); | ||
497 | page_cache_release(page); | ||
496 | return error; | 498 | return error; |
497 | } | 499 | } |
498 | EXPORT_SYMBOL(add_to_page_cache_locked); | 500 | EXPORT_SYMBOL(add_to_page_cache_locked); |
@@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1614 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
1615 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
1616 | struct page *page; | 1618 | struct page *page; |
1619 | bool memcg_oom; | ||
1617 | pgoff_t size; | 1620 | pgoff_t size; |
1618 | int ret = 0; | 1621 | int ret = 0; |
1619 | 1622 | ||
@@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1622 | return VM_FAULT_SIGBUS; | 1625 | return VM_FAULT_SIGBUS; |
1623 | 1626 | ||
1624 | /* | 1627 | /* |
1625 | * Do we have something in the page cache already? | 1628 | * Do we have something in the page cache already? Either |
1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
1630 | * as readahead is optional and no errors are propagated up | ||
1631 | * the fault stack. The OOM killer is enabled while trying to | ||
1632 | * instantiate the faulting page individually below. | ||
1626 | */ | 1633 | */ |
1627 | page = find_get_page(mapping, offset); | 1634 | page = find_get_page(mapping, offset); |
1628 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
@@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1630 | * We found the page, so try async readahead before | 1637 | * We found the page, so try async readahead before |
1631 | * waiting for the lock. | 1638 | * waiting for the lock. |
1632 | */ | 1639 | */ |
1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1633 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1641 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
1634 | } else if (!page) { | 1643 | } else if (!page) { |
1635 | /* No page in the page cache at all */ | 1644 | /* No page in the page cache at all */ |
1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1636 | do_sync_mmap_readahead(vma, ra, file, offset); | 1646 | do_sync_mmap_readahead(vma, ra, file, offset); |
1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
1637 | count_vm_event(PGMAJFAULT); | 1648 | count_vm_event(PGMAJFAULT); |
1638 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1639 | ret = VM_FAULT_MAJOR; | 1650 | ret = VM_FAULT_MAJOR; |
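
The add_to_page_cache_locked() cleanup earlier in this file replaces the nested success branch with straight-line code: early returns while nothing needs unwinding, then a single err_insert label once cleanup is required. The self-contained sketch below shows the same shape with invented names (charge/insert/uncharge); it illustrates the restructuring only, not the page cache API.

#include <stdio.h>

/* Invented stand-ins for the charge and insert steps. */
static int charge(void)        { return 0; }
static void uncharge(void)     { puts("uncharge"); }
static int insert(int fail)    { return fail ? -1 : 0; }

static int add_item(int make_insert_fail)
{
        int error;

        error = charge();
        if (error)
                return error;           /* nothing to unwind yet */

        error = insert(make_insert_fail);
        if (error)
                goto err_insert;        /* single unwind point */

        puts("added");
        return 0;

err_insert:
        uncharge();
        return error;
}

int main(void)
{
        add_item(0);    /* prints "added" */
        add_item(1);    /* prints "uncharge" */
        return 0;
}
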
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d66010e0049d..7489884682d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
695 | return pmd; | 695 | return pmd; |
696 | } | 696 | } |
697 | 697 | ||
698 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | 698 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) |
699 | { | 699 | { |
700 | pmd_t entry; | 700 | pmd_t entry; |
701 | entry = mk_pmd(page, vma->vm_page_prot); | 701 | entry = mk_pmd(page, prot); |
702 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
703 | entry = pmd_mkhuge(entry); | 702 | entry = pmd_mkhuge(entry); |
704 | return entry; | 703 | return entry; |
705 | } | 704 | } |
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
732 | pte_free(mm, pgtable); | 731 | pte_free(mm, pgtable); |
733 | } else { | 732 | } else { |
734 | pmd_t entry; | 733 | pmd_t entry; |
735 | entry = mk_huge_pmd(page, vma); | 734 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
736 | page_add_new_anon_rmap(page, vma, haddr); | 736 | page_add_new_anon_rmap(page, vma, haddr); |
737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
738 | set_pmd_at(mm, haddr, pmd, entry); | 738 | set_pmd_at(mm, haddr, pmd, entry); |
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
788 | { | 788 | { |
789 | struct page *page; | 789 | struct page *page; |
790 | unsigned long haddr = address & HPAGE_PMD_MASK; | 790 | unsigned long haddr = address & HPAGE_PMD_MASK; |
791 | pte_t *pte; | ||
792 | 791 | ||
793 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | 792 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
794 | if (unlikely(anon_vma_prepare(vma))) | 793 | return VM_FAULT_FALLBACK; |
795 | return VM_FAULT_OOM; | 794 | if (unlikely(anon_vma_prepare(vma))) |
796 | if (unlikely(khugepaged_enter(vma))) | 795 | return VM_FAULT_OOM; |
796 | if (unlikely(khugepaged_enter(vma))) | ||
797 | return VM_FAULT_OOM; | ||
798 | if (!(flags & FAULT_FLAG_WRITE) && | ||
799 | transparent_hugepage_use_zero_page()) { | ||
800 | pgtable_t pgtable; | ||
801 | struct page *zero_page; | ||
802 | bool set; | ||
803 | pgtable = pte_alloc_one(mm, haddr); | ||
804 | if (unlikely(!pgtable)) | ||
797 | return VM_FAULT_OOM; | 805 | return VM_FAULT_OOM; |
798 | if (!(flags & FAULT_FLAG_WRITE) && | 806 | zero_page = get_huge_zero_page(); |
799 | transparent_hugepage_use_zero_page()) { | 807 | if (unlikely(!zero_page)) { |
800 | pgtable_t pgtable; | 808 | pte_free(mm, pgtable); |
801 | struct page *zero_page; | ||
802 | bool set; | ||
803 | pgtable = pte_alloc_one(mm, haddr); | ||
804 | if (unlikely(!pgtable)) | ||
805 | return VM_FAULT_OOM; | ||
806 | zero_page = get_huge_zero_page(); | ||
807 | if (unlikely(!zero_page)) { | ||
808 | pte_free(mm, pgtable); | ||
809 | count_vm_event(THP_FAULT_FALLBACK); | ||
810 | goto out; | ||
811 | } | ||
812 | spin_lock(&mm->page_table_lock); | ||
813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
814 | zero_page); | ||
815 | spin_unlock(&mm->page_table_lock); | ||
816 | if (!set) { | ||
817 | pte_free(mm, pgtable); | ||
818 | put_huge_zero_page(); | ||
819 | } | ||
820 | return 0; | ||
821 | } | ||
822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
823 | vma, haddr, numa_node_id(), 0); | ||
824 | if (unlikely(!page)) { | ||
825 | count_vm_event(THP_FAULT_FALLBACK); | 809 | count_vm_event(THP_FAULT_FALLBACK); |
826 | goto out; | 810 | return VM_FAULT_FALLBACK; |
827 | } | ||
828 | count_vm_event(THP_FAULT_ALLOC); | ||
829 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
830 | put_page(page); | ||
831 | goto out; | ||
832 | } | 811 | } |
833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | 812 | spin_lock(&mm->page_table_lock); |
834 | page))) { | 813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
835 | mem_cgroup_uncharge_page(page); | 814 | zero_page); |
836 | put_page(page); | 815 | spin_unlock(&mm->page_table_lock); |
837 | goto out; | 816 | if (!set) { |
817 | pte_free(mm, pgtable); | ||
818 | put_huge_zero_page(); | ||
838 | } | 819 | } |
839 | |||
840 | return 0; | 820 | return 0; |
841 | } | 821 | } |
842 | out: | 822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
843 | /* | 823 | vma, haddr, numa_node_id(), 0); |
844 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 824 | if (unlikely(!page)) { |
845 | * run pte_offset_map on the pmd, if an huge pmd could | 825 | count_vm_event(THP_FAULT_FALLBACK); |
846 | * materialize from under us from a different thread. | 826 | return VM_FAULT_FALLBACK; |
847 | */ | 827 | } |
848 | if (unlikely(pmd_none(*pmd)) && | 828 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
849 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 829 | put_page(page); |
850 | return VM_FAULT_OOM; | 830 | count_vm_event(THP_FAULT_FALLBACK); |
851 | /* if an huge pmd materialized from under us just retry later */ | 831 | return VM_FAULT_FALLBACK; |
852 | if (unlikely(pmd_trans_huge(*pmd))) | 832 | } |
853 | return 0; | 833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { |
854 | /* | 834 | mem_cgroup_uncharge_page(page); |
855 | * A regular pmd is established and it can't morph into a huge pmd | 835 | put_page(page); |
856 | * from under us anymore at this point because we hold the mmap_sem | 836 | count_vm_event(THP_FAULT_FALLBACK); |
857 | * read mode and khugepaged takes it in write mode. So now it's | 837 | return VM_FAULT_FALLBACK; |
858 | * safe to run pte_offset_map(). | 838 | } |
859 | */ | 839 | |
860 | pte = pte_offset_map(pmd, address); | 840 | count_vm_event(THP_FAULT_ALLOC); |
861 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 841 | return 0; |
862 | } | 842 | } |
863 | 843 | ||
864 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 844 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
@@ -1170,7 +1150,6 @@ alloc: | |||
1170 | new_page = NULL; | 1150 | new_page = NULL; |
1171 | 1151 | ||
1172 | if (unlikely(!new_page)) { | 1152 | if (unlikely(!new_page)) { |
1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
1174 | if (is_huge_zero_pmd(orig_pmd)) { | 1153 | if (is_huge_zero_pmd(orig_pmd)) { |
1175 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, | 1154 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
1176 | address, pmd, orig_pmd, haddr); | 1155 | address, pmd, orig_pmd, haddr); |
@@ -1181,9 +1160,9 @@ alloc: | |||
1181 | split_huge_page(page); | 1160 | split_huge_page(page); |
1182 | put_page(page); | 1161 | put_page(page); |
1183 | } | 1162 | } |
1163 | count_vm_event(THP_FAULT_FALLBACK); | ||
1184 | goto out; | 1164 | goto out; |
1185 | } | 1165 | } |
1186 | count_vm_event(THP_FAULT_ALLOC); | ||
1187 | 1166 | ||
1188 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1167 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
1189 | put_page(new_page); | 1168 | put_page(new_page); |
@@ -1191,10 +1170,13 @@ alloc: | |||
1191 | split_huge_page(page); | 1170 | split_huge_page(page); |
1192 | put_page(page); | 1171 | put_page(page); |
1193 | } | 1172 | } |
1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
1194 | ret |= VM_FAULT_OOM; | 1174 | ret |= VM_FAULT_OOM; |
1195 | goto out; | 1175 | goto out; |
1196 | } | 1176 | } |
1197 | 1177 | ||
1178 | count_vm_event(THP_FAULT_ALLOC); | ||
1179 | |||
1198 | if (is_huge_zero_pmd(orig_pmd)) | 1180 | if (is_huge_zero_pmd(orig_pmd)) |
1199 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | 1181 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); |
1200 | else | 1182 | else |
@@ -1215,7 +1197,8 @@ alloc: | |||
1215 | goto out_mn; | 1197 | goto out_mn; |
1216 | } else { | 1198 | } else { |
1217 | pmd_t entry; | 1199 | pmd_t entry; |
1218 | entry = mk_huge_pmd(new_page, vma); | 1200 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1201 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1219 | pmdp_clear_flush(vma, haddr, pmd); | 1202 | pmdp_clear_flush(vma, haddr, pmd); |
1220 | page_add_new_anon_rmap(new_page, vma, haddr); | 1203 | page_add_new_anon_rmap(new_page, vma, haddr); |
1221 | set_pmd_at(mm, haddr, pmd, entry); | 1204 | set_pmd_at(mm, haddr, pmd, entry); |
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page, | |||
1666 | BUG_ON(atomic_read(&page->_count) <= 0); | 1649 | BUG_ON(atomic_read(&page->_count) <= 0); |
1667 | 1650 | ||
1668 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); | 1651 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
1669 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
1670 | 1652 | ||
1671 | ClearPageCompound(page); | 1653 | ClearPageCompound(page); |
1672 | compound_unlock(page); | 1654 | compound_unlock(page); |
@@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2364 | __SetPageUptodate(new_page); | 2346 | __SetPageUptodate(new_page); |
2365 | pgtable = pmd_pgtable(_pmd); | 2347 | pgtable = pmd_pgtable(_pmd); |
2366 | 2348 | ||
2367 | _pmd = mk_huge_pmd(new_page, vma); | 2349 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); |
2350 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2368 | 2351 | ||
2369 | /* | 2352 | /* |
2370 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2353 | * spin_lock() below is not the equivalent of smp_wmb(), so |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c6bd28edd533..d5ff3ce13029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/rbtree.h> | ||
43 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
44 | #include <linux/swap.h> | 43 | #include <linux/swap.h> |
45 | #include <linux/swapops.h> | 44 | #include <linux/swapops.h> |
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0; | |||
85 | #endif | 84 | #endif |
86 | 85 | ||
87 | 86 | ||
88 | /* | ||
89 | * Statistics for memory cgroup. | ||
90 | */ | ||
91 | enum mem_cgroup_stat_index { | ||
92 | /* | ||
93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
94 | */ | ||
95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | ||
97 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ | ||
98 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | ||
99 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
100 | MEM_CGROUP_STAT_NSTATS, | ||
101 | }; | ||
102 | |||
103 | static const char * const mem_cgroup_stat_names[] = { | 87 | static const char * const mem_cgroup_stat_names[] = { |
104 | "cache", | 88 | "cache", |
105 | "rss", | 89 | "rss", |
106 | "rss_huge", | 90 | "rss_huge", |
107 | "mapped_file", | 91 | "mapped_file", |
92 | "writeback", | ||
108 | "swap", | 93 | "swap", |
109 | }; | 94 | }; |
110 | 95 | ||
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone { | |||
175 | 160 | ||
176 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
177 | 162 | ||
178 | struct rb_node tree_node; /* RB tree node */ | ||
179 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
180 | /* the soft limit is exceeded*/ | ||
181 | bool on_tree; | ||
182 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
183 | /* use container_of */ | 164 | /* use container_of */ |
184 | }; | 165 | }; |
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node { | |||
187 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
188 | }; | 169 | }; |
189 | 170 | ||
190 | /* | ||
191 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
192 | * their hierarchy representation | ||
193 | */ | ||
194 | |||
195 | struct mem_cgroup_tree_per_zone { | ||
196 | struct rb_root rb_root; | ||
197 | spinlock_t lock; | ||
198 | }; | ||
199 | |||
200 | struct mem_cgroup_tree_per_node { | ||
201 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
202 | }; | ||
203 | |||
204 | struct mem_cgroup_tree { | ||
205 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
206 | }; | ||
207 | |||
208 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
209 | |||
210 | struct mem_cgroup_threshold { | 171 | struct mem_cgroup_threshold { |
211 | struct eventfd_ctx *eventfd; | 172 | struct eventfd_ctx *eventfd; |
212 | u64 threshold; | 173 | u64 threshold; |
@@ -280,6 +241,7 @@ struct mem_cgroup { | |||
280 | 241 | ||
281 | bool oom_lock; | 242 | bool oom_lock; |
282 | atomic_t under_oom; | 243 | atomic_t under_oom; |
244 | atomic_t oom_wakeups; | ||
283 | 245 | ||
284 | int swappiness; | 246 | int swappiness; |
285 | /* OOM-Killer disable */ | 247 | /* OOM-Killer disable */ |
@@ -304,7 +266,7 @@ struct mem_cgroup { | |||
304 | * Should we move charges of a task when a task is moved into this | 266 | * Should we move charges of a task when a task is moved into this |
305 | * mem_cgroup ? And what type of charges should we move ? | 267 | * mem_cgroup ? And what type of charges should we move ? |
306 | */ | 268 | */ |
307 | unsigned long move_charge_at_immigrate; | 269 | unsigned long move_charge_at_immigrate; |
308 | /* | 270 | /* |
309 | * set > 0 if pages under this cgroup are moving to other cgroup. | 271 | * set > 0 if pages under this cgroup are moving to other cgroup. |
310 | */ | 272 | */ |
@@ -341,6 +303,22 @@ struct mem_cgroup { | |||
341 | atomic_t numainfo_events; | 303 | atomic_t numainfo_events; |
342 | atomic_t numainfo_updating; | 304 | atomic_t numainfo_updating; |
343 | #endif | 305 | #endif |
306 | /* | ||
307 | * Protects soft_contributed transitions. | ||
308 | * See mem_cgroup_update_soft_limit | ||
309 | */ | ||
310 | spinlock_t soft_lock; | ||
311 | |||
312 | /* | ||
313 | * If true then this group has increased parents' children_in_excess | ||
314 | * when it got over the soft limit. | ||
315 | * When a group falls below the soft limit, parents' children_in_excess | ||
316 | * is decreased and soft_contributed changed to false. | ||
317 | */ | ||
318 | bool soft_contributed; | ||
319 | |||
320 | /* Number of children that are in soft limit excess */ | ||
321 | atomic_t children_in_excess; | ||
344 | 322 | ||
345 | struct mem_cgroup_per_node *nodeinfo[0]; | 323 | struct mem_cgroup_per_node *nodeinfo[0]; |
346 | /* WARNING: nodeinfo must be the last member here */ | 324 | /* WARNING: nodeinfo must be the last member here */ |
@@ -444,7 +422,6 @@ static bool move_file(void) | |||
444 | * limit reclaim to prevent infinite loops, if they ever occur. | 422 | * limit reclaim to prevent infinite loops, if they ever occur. |
445 | */ | 423 | */ |
446 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
447 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
448 | 425 | ||
449 | enum charge_type { | 426 | enum charge_type { |
450 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
671 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 648 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
672 | } | 649 | } |
673 | 650 | ||
674 | static struct mem_cgroup_tree_per_zone * | ||
675 | soft_limit_tree_node_zone(int nid, int zid) | ||
676 | { | ||
677 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
678 | } | ||
679 | |||
680 | static struct mem_cgroup_tree_per_zone * | ||
681 | soft_limit_tree_from_page(struct page *page) | ||
682 | { | ||
683 | int nid = page_to_nid(page); | ||
684 | int zid = page_zonenum(page); | ||
685 | |||
686 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
687 | } | ||
688 | |||
689 | static void | ||
690 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
691 | struct mem_cgroup_per_zone *mz, | ||
692 | struct mem_cgroup_tree_per_zone *mctz, | ||
693 | unsigned long long new_usage_in_excess) | ||
694 | { | ||
695 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
696 | struct rb_node *parent = NULL; | ||
697 | struct mem_cgroup_per_zone *mz_node; | ||
698 | |||
699 | if (mz->on_tree) | ||
700 | return; | ||
701 | |||
702 | mz->usage_in_excess = new_usage_in_excess; | ||
703 | if (!mz->usage_in_excess) | ||
704 | return; | ||
705 | while (*p) { | ||
706 | parent = *p; | ||
707 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
708 | tree_node); | ||
709 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
710 | p = &(*p)->rb_left; | ||
711 | /* | ||
712 | * We can't avoid mem cgroups that are over their soft | ||
713 | * limit by the same amount | ||
714 | */ | ||
715 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
716 | p = &(*p)->rb_right; | ||
717 | } | ||
718 | rb_link_node(&mz->tree_node, parent, p); | ||
719 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
720 | mz->on_tree = true; | ||
721 | } | ||
722 | |||
723 | static void | ||
724 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
725 | struct mem_cgroup_per_zone *mz, | ||
726 | struct mem_cgroup_tree_per_zone *mctz) | ||
727 | { | ||
728 | if (!mz->on_tree) | ||
729 | return; | ||
730 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
731 | mz->on_tree = false; | ||
732 | } | ||
733 | |||
734 | static void | ||
735 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
736 | struct mem_cgroup_per_zone *mz, | ||
737 | struct mem_cgroup_tree_per_zone *mctz) | ||
738 | { | ||
739 | spin_lock(&mctz->lock); | ||
740 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
741 | spin_unlock(&mctz->lock); | ||
742 | } | ||
743 | |||
744 | |||
745 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
746 | { | ||
747 | unsigned long long excess; | ||
748 | struct mem_cgroup_per_zone *mz; | ||
749 | struct mem_cgroup_tree_per_zone *mctz; | ||
750 | int nid = page_to_nid(page); | ||
751 | int zid = page_zonenum(page); | ||
752 | mctz = soft_limit_tree_from_page(page); | ||
753 | |||
754 | /* | ||
755 | * Necessary to update all ancestors when hierarchy is used. | ||
756 | * because their event counter is not touched. | ||
757 | */ | ||
758 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
759 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
760 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
761 | /* | ||
762 | * We have to update the tree if mz is on RB-tree or | ||
763 | * mem is over its softlimit. | ||
764 | */ | ||
765 | if (excess || mz->on_tree) { | ||
766 | spin_lock(&mctz->lock); | ||
767 | /* if on-tree, remove it */ | ||
768 | if (mz->on_tree) | ||
769 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
770 | /* | ||
771 | * Insert again. mz->usage_in_excess will be updated. | ||
772 | * If excess is 0, no tree ops. | ||
773 | */ | ||
774 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
775 | spin_unlock(&mctz->lock); | ||
776 | } | ||
777 | } | ||
778 | } | ||
779 | |||
780 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
781 | { | ||
782 | int node, zone; | ||
783 | struct mem_cgroup_per_zone *mz; | ||
784 | struct mem_cgroup_tree_per_zone *mctz; | ||
785 | |||
786 | for_each_node(node) { | ||
787 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
788 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
789 | mctz = soft_limit_tree_node_zone(node, zone); | ||
790 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
791 | } | ||
792 | } | ||
793 | } | ||
794 | |||
795 | static struct mem_cgroup_per_zone * | ||
796 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
797 | { | ||
798 | struct rb_node *rightmost = NULL; | ||
799 | struct mem_cgroup_per_zone *mz; | ||
800 | |||
801 | retry: | ||
802 | mz = NULL; | ||
803 | rightmost = rb_last(&mctz->rb_root); | ||
804 | if (!rightmost) | ||
805 | goto done; /* Nothing to reclaim from */ | ||
806 | |||
807 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
808 | /* | ||
809 | * Remove the node now but someone else can add it back, | ||
810 | * we will to add it back at the end of reclaim to its correct | ||
811 | * position in the tree. | ||
812 | */ | ||
813 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
814 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
815 | !css_tryget(&mz->memcg->css)) | ||
816 | goto retry; | ||
817 | done: | ||
818 | return mz; | ||
819 | } | ||
820 | |||
821 | static struct mem_cgroup_per_zone * | ||
822 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
823 | { | ||
824 | struct mem_cgroup_per_zone *mz; | ||
825 | |||
826 | spin_lock(&mctz->lock); | ||
827 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
828 | spin_unlock(&mctz->lock); | ||
829 | return mz; | ||
830 | } | ||
831 | |||
832 | /* | 651 | /* |
833 | * Implementation Note: reading percpu statistics for memcg. | 652 | * Implementation Note: reading percpu statistics for memcg. |
834 | * | 653 | * |
@@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
1003 | } | 822 | } |
1004 | 823 | ||
1005 | /* | 824 | /* |
825 | * Called from rate-limited memcg_check_events when enough | ||
826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
827 | * that all the parents up the hierarchy will be notified that this group | ||
828 | * is in excess or that it is not in excess anymore. memcg->soft_contributed | ||
829 | * makes the transition a single action whenever the state flips from one to | ||
830 | * the other. | ||
831 | */ | ||
832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
833 | { | ||
834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
835 | struct mem_cgroup *parent = memcg; | ||
836 | int delta = 0; | ||
837 | |||
838 | spin_lock(&memcg->soft_lock); | ||
839 | if (excess) { | ||
840 | if (!memcg->soft_contributed) { | ||
841 | delta = 1; | ||
842 | memcg->soft_contributed = true; | ||
843 | } | ||
844 | } else { | ||
845 | if (memcg->soft_contributed) { | ||
846 | delta = -1; | ||
847 | memcg->soft_contributed = false; | ||
848 | } | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Necessary to update all ancestors when hierarchy is used | ||
853 | * because their event counter is not touched. | ||
854 | * We track children even outside the hierarchy for the root | ||
855 | * cgroup because a tree walk starting at root should visit | ||
856 | * all cgroups, and we want to avoid a pointless tree | ||
857 | * walk when no child is over its soft limit. | ||
858 | */ | ||
859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
860 | atomic_add(delta, &parent->children_in_excess); | ||
861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
863 | spin_unlock(&memcg->soft_lock); | ||
864 | } | ||
865 | |||
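As a standalone illustration of the state-flip bookkeeping above, here is a minimal userspace C sketch -- toy types and names, not the kernel API -- in which a per-group flag records whether the group is already counted against its ancestors, so each crossing of the soft limit triggers exactly one +1/-1 walk up the parent chain:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct group {
        struct group *parent;
        long usage, soft_limit;
        bool contributed;               /* already counted against the ancestors? */
        atomic_int children_in_excess;  /* descendants currently over their limit */
    };

    /* Called whenever usage may have crossed the soft limit (the kernel
     * rate-limits this through its event counters). */
    static void update_soft_limit(struct group *g)
    {
        bool over = g->usage > g->soft_limit;
        int delta = 0;
        struct group *p;

        if (over && !g->contributed) {
            delta = 1;
            g->contributed = true;
        } else if (!over && g->contributed) {
            delta = -1;
            g->contributed = false;
        }

        /* One transition in either direction means one walk up the chain. */
        for (p = g->parent; delta && p; p = p->parent)
            atomic_fetch_add(&p->children_in_excess, delta);
    }

    int main(void)
    {
        static struct group root, child;

        root.soft_limit = 100;
        child.parent = &root;
        child.soft_limit = 10;

        child.usage = 50;
        update_soft_limit(&child);      /* root now sees one child in excess */
        child.usage = 5;
        update_soft_limit(&child);      /* ...and then none again */
        printf("children_in_excess(root) = %d\n",
               atomic_load(&root.children_in_excess));
        return 0;
    }

The kernel version additionally serializes concurrent flips with memcg->soft_lock; the sketch omits locking for brevity.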
866 | /* | ||
1006 | * Check events in order. | 867 | * Check events in order. |
1007 | * | 868 | * |
1008 | */ | 869 | */ |
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
1025 | 886 | ||
1026 | mem_cgroup_threshold(memcg); | 887 | mem_cgroup_threshold(memcg); |
1027 | if (unlikely(do_softlimit)) | 888 | if (unlikely(do_softlimit)) |
1028 | mem_cgroup_update_tree(memcg, page); | 889 | mem_cgroup_update_soft_limit(memcg); |
1029 | #if MAX_NUMNODES > 1 | 890 | #if MAX_NUMNODES > 1 |
1030 | if (unlikely(do_numainfo)) | 891 | if (unlikely(do_numainfo)) |
1031 | atomic_inc(&memcg->numainfo_events); | 892 | atomic_inc(&memcg->numainfo_events); |
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1068 | return memcg; | 929 | return memcg; |
1069 | } | 930 | } |
1070 | 931 | ||
932 | static enum mem_cgroup_filter_t | ||
933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
934 | mem_cgroup_iter_filter cond) | ||
935 | { | ||
936 | if (!cond) | ||
937 | return VISIT; | ||
938 | return cond(memcg, root); | ||
939 | } | ||
940 | |||
1071 | /* | 941 | /* |
1072 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
1073 | * ref. count) or NULL if the whole root's subtree has been visited. | 943 | * ref. count) or NULL if the whole root's subtree has been visited. |
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1075 | * helper function to be used by mem_cgroup_iter | 945 | * helper function to be used by mem_cgroup_iter |
1076 | */ | 946 | */ |
1077 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
1078 | struct mem_cgroup *last_visited) | 948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) |
1079 | { | 949 | { |
1080 | struct cgroup_subsys_state *prev_css, *next_css; | 950 | struct cgroup_subsys_state *prev_css, *next_css; |
1081 | 951 | ||
@@ -1093,11 +963,31 @@ skip_node: | |||
1093 | if (next_css) { | 963 | if (next_css) { |
1094 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
1095 | 965 | ||
1096 | if (css_tryget(&mem->css)) | 966 | switch (mem_cgroup_filter(mem, root, cond)) { |
1097 | return mem; | 967 | case SKIP: |
1098 | else { | ||
1099 | prev_css = next_css; | 968 | prev_css = next_css; |
1100 | goto skip_node; | 969 | goto skip_node; |
970 | case SKIP_TREE: | ||
971 | if (mem == root) | ||
972 | return NULL; | ||
973 | /* | ||
974 | * css_rightmost_descendant is not an optimal way to | ||
975 | * skip through a subtree (especially for imbalanced | ||
976 | * trees leaning to right) but that's what we have right | ||
977 | * now. More effective solution would be traversing | ||
978 | * right-up for first non-NULL without calling | ||
979 | * css_next_descendant_pre afterwards. | ||
980 | */ | ||
981 | prev_css = css_rightmost_descendant(next_css); | ||
982 | goto skip_node; | ||
983 | case VISIT: | ||
984 | if (css_tryget(&mem->css)) | ||
985 | return mem; | ||
986 | else { | ||
987 | prev_css = next_css; | ||
988 | goto skip_node; | ||
989 | } | ||
990 | break; | ||
1101 | } | 991 | } |
1102 | } | 992 | } |
1103 | 993 | ||
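The VISIT/SKIP/SKIP_TREE handling above is a pre-order walk whose filter can either pass over one node or prune its whole subtree. The sketch below shows that traversal pattern on a toy first-child/next-sibling tree in plain C; all names are illustrative and none of this is the css iterator API:

    #include <stdio.h>
    #include <stddef.h>

    enum filter_t { VISIT, SKIP, SKIP_TREE };

    struct node {
        const char *name;
        struct node *parent, *child, *sibling;  /* first-child / next-sibling tree */
    };

    /* Next node in pre-order; if skip_subtree is set, do not descend into n. */
    static struct node *preorder_next(struct node *n, struct node *root,
                                      int skip_subtree)
    {
        if (!skip_subtree && n->child)
            return n->child;
        while (n && n != root) {
            if (n->sibling)
                return n->sibling;
            n = n->parent;
        }
        return NULL;
    }

    static void walk(struct node *root, enum filter_t (*cond)(struct node *))
    {
        struct node *n = root;

        while (n) {
            enum filter_t f = cond ? cond(n) : VISIT;

            if (f == VISIT)
                printf("visit %s\n", n->name);
            /* SKIP still descends into the node; SKIP_TREE prunes it. */
            n = preorder_next(n, root, f == SKIP_TREE);
        }
    }

    /* Hypothetical policy: skip the node called "c", prune everything under "d". */
    static enum filter_t example_filter(struct node *n)
    {
        if (n->name[0] == 'c')
            return SKIP;
        if (n->name[0] == 'd')
            return SKIP_TREE;
        return VISIT;
    }

    int main(void)
    {
        struct node a = { "a" }, b = { "b" }, c = { "c" }, d = { "d" }, e = { "e" };

        a.child = &b;    b.parent = &a;
        b.sibling = &d;  d.parent = &a;
        b.child = &c;    c.parent = &b;
        d.child = &e;    e.parent = &d;

        walk(&a, example_filter);       /* prints "visit a" and "visit b" */
        return 0;
    }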
@@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1161 | * @root: hierarchy root | 1051 | * @root: hierarchy root |
1162 | * @prev: previously returned memcg, NULL on first invocation | 1052 | * @prev: previously returned memcg, NULL on first invocation |
1163 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
1054 | * @cond: filter for visited nodes, NULL for no filter | ||
1164 | * | 1055 | * |
1165 | * Returns references to children of the hierarchy below @root, or | 1056 | * Returns references to children of the hierarchy below @root, or |
1166 | * @root itself, or %NULL after a full round-trip. | 1057 | * @root itself, or %NULL after a full round-trip. |
@@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1173 | * divide up the memcgs in the hierarchy among all concurrent | 1064 | * divide up the memcgs in the hierarchy among all concurrent |
1174 | * reclaimers operating on the same zone and priority. | 1065 | * reclaimers operating on the same zone and priority. |
1175 | */ | 1066 | */ |
1176 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | 1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, |
1177 | struct mem_cgroup *prev, | 1068 | struct mem_cgroup *prev, |
1178 | struct mem_cgroup_reclaim_cookie *reclaim) | 1069 | struct mem_cgroup_reclaim_cookie *reclaim, |
1070 | mem_cgroup_iter_filter cond) | ||
1179 | { | 1071 | { |
1180 | struct mem_cgroup *memcg = NULL; | 1072 | struct mem_cgroup *memcg = NULL; |
1181 | struct mem_cgroup *last_visited = NULL; | 1073 | struct mem_cgroup *last_visited = NULL; |
1182 | 1074 | ||
1183 | if (mem_cgroup_disabled()) | 1075 | if (mem_cgroup_disabled()) { |
1184 | return NULL; | 1076 | /* first call must return non-NULL, second return NULL */ |
1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
1078 | } | ||
1185 | 1079 | ||
1186 | if (!root) | 1080 | if (!root) |
1187 | root = root_mem_cgroup; | 1081 | root = root_mem_cgroup; |
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1192 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1193 | if (prev) | 1087 | if (prev) |
1194 | goto out_css_put; | 1088 | goto out_css_put; |
1195 | return root; | 1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) |
1090 | return root; | ||
1091 | return NULL; | ||
1196 | } | 1092 | } |
1197 | 1093 | ||
1198 | rcu_read_lock(); | 1094 | rcu_read_lock(); |
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1215 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
1216 | } | 1112 | } |
1217 | 1113 | ||
1218 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); |
1219 | 1115 | ||
1220 | if (reclaim) { | 1116 | if (reclaim) { |
1221 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1226 | reclaim->generation = iter->generation; | 1122 | reclaim->generation = iter->generation; |
1227 | } | 1123 | } |
1228 | 1124 | ||
1229 | if (prev && !memcg) | 1125 | /* |
1126 | * We have finished the whole tree walk or no group has been | ||
1127 | * visited because the filter told us to skip the root node. | ||
1128 | */ | ||
1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
1230 | goto out_unlock; | 1130 | goto out_unlock; |
1231 | } | 1131 | } |
1232 | out_unlock: | 1132 | out_unlock: |
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1867 | return total; | 1767 | return total; |
1868 | } | 1768 | } |
1869 | 1769 | ||
1770 | #if MAX_NUMNODES > 1 | ||
1870 | /** | 1771 | /** |
1871 | * test_mem_cgroup_node_reclaimable | 1772 | * test_mem_cgroup_node_reclaimable |
1872 | * @memcg: the target memcg | 1773 | * @memcg: the target memcg |
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
1889 | return false; | 1790 | return false; |
1890 | 1791 | ||
1891 | } | 1792 | } |
1892 | #if MAX_NUMNODES > 1 | ||
1893 | 1793 | ||
1894 | /* | 1794 | /* |
1895 | * Always updating the nodemask is not very good - even if we have an empty | 1795 | * Always updating the nodemask is not very good - even if we have an empty |
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1957 | return node; | 1857 | return node; |
1958 | } | 1858 | } |
1959 | 1859 | ||
1960 | /* | ||
1961 | * Check all nodes whether it contains reclaimable pages or not. | ||
1962 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1963 | * unused nodes. But scan_nodes is lazily updated and may not contain | ||
1964 | * enough new information. We need to do double check. | ||
1965 | */ | ||
1966 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1967 | { | ||
1968 | int nid; | ||
1969 | |||
1970 | /* | ||
1971 | * quick check...making use of scan_node. | ||
1972 | * We can skip unused nodes. | ||
1973 | */ | ||
1974 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1975 | for (nid = first_node(memcg->scan_nodes); | ||
1976 | nid < MAX_NUMNODES; | ||
1977 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1978 | |||
1979 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1980 | return true; | ||
1981 | } | ||
1982 | } | ||
1983 | /* | ||
1984 | * Check rest of nodes. | ||
1985 | */ | ||
1986 | for_each_node_state(nid, N_MEMORY) { | ||
1987 | if (node_isset(nid, memcg->scan_nodes)) | ||
1988 | continue; | ||
1989 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1990 | return true; | ||
1991 | } | ||
1992 | return false; | ||
1993 | } | ||
1994 | |||
1995 | #else | 1860 | #else |
1996 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1997 | { | 1862 | { |
1998 | return 0; | 1863 | return 0; |
1999 | } | 1864 | } |
2000 | 1865 | ||
2001 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
2002 | { | ||
2003 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | ||
2004 | } | ||
2005 | #endif | 1866 | #endif |
2006 | 1867 | ||
2007 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1868 | /* |
2008 | struct zone *zone, | 1869 | * A group is eligible for the soft limit reclaim under the given root |
2009 | gfp_t gfp_mask, | 1870 | * hierarchy if |
2010 | unsigned long *total_scanned) | 1871 | * a) it is over its soft limit |
2011 | { | 1872 | * b) any parent up the hierarchy is over its soft limit |
2012 | struct mem_cgroup *victim = NULL; | 1873 | * |
2013 | int total = 0; | 1874 | * If the given group doesn't have any children over the limit then it |
2014 | int loop = 0; | 1875 | * doesn't make any sense to iterate its subtree. |
2015 | unsigned long excess; | 1876 | */ |
2016 | unsigned long nr_scanned; | 1877 | enum mem_cgroup_filter_t |
2017 | struct mem_cgroup_reclaim_cookie reclaim = { | 1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
2018 | .zone = zone, | 1879 | struct mem_cgroup *root) |
2019 | .priority = 0, | 1880 | { |
2020 | }; | 1881 | struct mem_cgroup *parent; |
2021 | 1882 | ||
2022 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1883 | if (!memcg) |
2023 | 1884 | memcg = root_mem_cgroup; | |
2024 | while (1) { | 1885 | parent = memcg; |
2025 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1886 | |
2026 | if (!victim) { | 1887 | if (res_counter_soft_limit_excess(&memcg->res)) |
2027 | loop++; | 1888 | return VISIT; |
2028 | if (loop >= 2) { | 1889 | |
2029 | /* | 1890 | /* |
2030 | * If we have not been able to reclaim | 1891 | * If any parent up to the root in the hierarchy is over its soft limit |
2031 | * anything, it might be because there are | 1892 | * then we have to obey and reclaim from this group as well. |
2032 | * no reclaimable pages under this hierarchy | 1893 | */ |
2033 | */ | 1894 | while ((parent = parent_mem_cgroup(parent))) { |
2034 | if (!total) | 1895 | if (res_counter_soft_limit_excess(&parent->res)) |
2035 | break; | 1896 | return VISIT; |
2036 | /* | 1897 | if (parent == root) |
2037 | * We want to do more targeted reclaim. | ||
2038 | * excess >> 2 is not too excessive so as to | ||
2039 | * reclaim too much, nor so little that we keep | ||
2040 | * coming back to reclaim from this cgroup | ||
2041 | */ | ||
2042 | if (total >= (excess >> 2) || | ||
2043 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
2044 | break; | ||
2045 | } | ||
2046 | continue; | ||
2047 | } | ||
2048 | if (!mem_cgroup_reclaimable(victim, false)) | ||
2049 | continue; | ||
2050 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
2051 | zone, &nr_scanned); | ||
2052 | *total_scanned += nr_scanned; | ||
2053 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
2054 | break; | 1898 | break; |
2055 | } | 1899 | } |
2056 | mem_cgroup_iter_break(root_memcg, victim); | 1900 | |
2057 | return total; | 1901 | if (!atomic_read(&memcg->children_in_excess)) |
1902 | return SKIP_TREE; | ||
1903 | return SKIP; | ||
2058 | } | 1904 | } |
2059 | 1905 | ||
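Read as a whole, the eligibility rule is: visit a group that is over its soft limit or has an ancestor (up to the reclaim root) that is, skip it otherwise, and prune its subtree when no descendant is in excess either. A self-contained sketch of that decision, assuming toy types with a plain parent pointer and counter instead of res_counter:

    #include <stdio.h>

    enum filter { VISIT, SKIP, SKIP_TREE };

    struct cg {
        struct cg *parent;
        long usage, soft_limit;
        int children_in_excess;
    };

    static enum filter soft_reclaim_eligible(struct cg *g, struct cg *root)
    {
        struct cg *p;

        if (g->usage > g->soft_limit)
            return VISIT;

        for (p = g->parent; p; p = p->parent) {
            if (p->usage > p->soft_limit)
                return VISIT;
            if (p == root)
                break;
        }
        return g->children_in_excess ? SKIP : SKIP_TREE;
    }

    int main(void)
    {
        struct cg root = { .soft_limit = 100, .usage = 150 };
        struct cg child = { .parent = &root, .soft_limit = 50, .usage = 10 };

        /* The parent is over its soft limit, so the child is reclaimed as well. */
        printf("%s\n", soft_reclaim_eligible(&child, &root) == VISIT
                       ? "VISIT" : "SKIP");
        return 0;
    }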
1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
1907 | |||
2060 | /* | 1908 | /* |
2061 | * Check OOM-Killer is already running under our hierarchy. | 1909 | * Check OOM-Killer is already running under our hierarchy. |
2062 | * If someone is running, return false. | 1910 | * If someone is running, return false. |
2063 | * Has to be called with memcg_oom_lock | ||
2064 | */ | 1911 | */ |
2065 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | 1912 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
2066 | { | 1913 | { |
2067 | struct mem_cgroup *iter, *failed = NULL; | 1914 | struct mem_cgroup *iter, *failed = NULL; |
2068 | 1915 | ||
1916 | spin_lock(&memcg_oom_lock); | ||
1917 | |||
2069 | for_each_mem_cgroup_tree(iter, memcg) { | 1918 | for_each_mem_cgroup_tree(iter, memcg) { |
2070 | if (iter->oom_lock) { | 1919 | if (iter->oom_lock) { |
2071 | /* | 1920 | /* |
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | |||
2079 | iter->oom_lock = true; | 1928 | iter->oom_lock = true; |
2080 | } | 1929 | } |
2081 | 1930 | ||
2082 | if (!failed) | 1931 | if (failed) { |
2083 | return true; | 1932 | /* |
2084 | 1933 | * OK, we failed to lock the whole subtree so we have | |
2085 | /* | 1934 | * to clean up what we set up to the failing subtree |
2086 | * OK, we failed to lock the whole subtree so we have to clean up | 1935 | */ |
2087 | * what we set up to the failing subtree | 1936 | for_each_mem_cgroup_tree(iter, memcg) { |
2088 | */ | 1937 | if (iter == failed) { |
2089 | for_each_mem_cgroup_tree(iter, memcg) { | 1938 | mem_cgroup_iter_break(memcg, iter); |
2090 | if (iter == failed) { | 1939 | break; |
2091 | mem_cgroup_iter_break(memcg, iter); | 1940 | } |
2092 | break; | 1941 | iter->oom_lock = false; |
2093 | } | 1942 | } |
2094 | iter->oom_lock = false; | ||
2095 | } | 1943 | } |
2096 | return false; | 1944 | |
1945 | spin_unlock(&memcg_oom_lock); | ||
1946 | |||
1947 | return !failed; | ||
2097 | } | 1948 | } |
2098 | 1949 | ||
2099 | /* | 1950 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
2100 | * Has to be called with memcg_oom_lock | ||
2101 | */ | ||
2102 | static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | ||
2103 | { | 1951 | { |
2104 | struct mem_cgroup *iter; | 1952 | struct mem_cgroup *iter; |
2105 | 1953 | ||
1954 | spin_lock(&memcg_oom_lock); | ||
2106 | for_each_mem_cgroup_tree(iter, memcg) | 1955 | for_each_mem_cgroup_tree(iter, memcg) |
2107 | iter->oom_lock = false; | 1956 | iter->oom_lock = false; |
2108 | return 0; | 1957 | spin_unlock(&memcg_oom_lock); |
2109 | } | 1958 | } |
2110 | 1959 | ||
2111 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | 1960 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | |||
2129 | atomic_add_unless(&iter->under_oom, -1, 0); | 1978 | atomic_add_unless(&iter->under_oom, -1, 0); |
2130 | } | 1979 | } |
2131 | 1980 | ||
2132 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
2133 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1981 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
2134 | 1982 | ||
2135 | struct oom_wait_info { | 1983 | struct oom_wait_info { |
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
2159 | 2007 | ||
2160 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) | 2008 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
2161 | { | 2009 | { |
2010 | atomic_inc(&memcg->oom_wakeups); | ||
2162 | /* for filtering, pass "memcg" as argument. */ | 2011 | /* for filtering, pass "memcg" as argument. */ |
2163 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | 2012 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
2164 | } | 2013 | } |
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
2170 | } | 2019 | } |
2171 | 2020 | ||
2172 | /* | 2021 | /* |
2173 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 2022 | * try to call OOM killer |
2174 | */ | 2023 | */ |
2175 | static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, | 2024 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
2176 | int order) | ||
2177 | { | 2025 | { |
2178 | struct oom_wait_info owait; | 2026 | bool locked; |
2179 | bool locked, need_to_kill; | 2027 | int wakeups; |
2180 | 2028 | ||
2181 | owait.memcg = memcg; | 2029 | if (!current->memcg_oom.may_oom) |
2182 | owait.wait.flags = 0; | 2030 | return; |
2183 | owait.wait.func = memcg_oom_wake_function; | 2031 | |
2184 | owait.wait.private = current; | 2032 | current->memcg_oom.in_memcg_oom = 1; |
2185 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
2186 | need_to_kill = true; | ||
2187 | mem_cgroup_mark_under_oom(memcg); | ||
2188 | 2033 | ||
2189 | /* At first, try to OOM lock hierarchy under memcg.*/ | ||
2190 | spin_lock(&memcg_oom_lock); | ||
2191 | locked = mem_cgroup_oom_lock(memcg); | ||
2192 | /* | 2034 | /* |
2193 | * Even if signal_pending(), we can't quit charge() loop without | 2035 | * As with any blocking lock, a contender needs to start |
2194 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 2036 | * listening for wakeups before attempting the trylock, |
2195 | * under OOM is always welcomed, use TASK_KILLABLE here. | 2037 | * otherwise it can miss the wakeup from the unlock and sleep |
2038 | * indefinitely. This is just open-coded because our locking | ||
2039 | * is so particular to memcg hierarchies. | ||
2196 | */ | 2040 | */ |
2197 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2041 | wakeups = atomic_read(&memcg->oom_wakeups); |
2198 | if (!locked || memcg->oom_kill_disable) | 2042 | mem_cgroup_mark_under_oom(memcg); |
2199 | need_to_kill = false; | 2043 | |
2044 | locked = mem_cgroup_oom_trylock(memcg); | ||
2045 | |||
2200 | if (locked) | 2046 | if (locked) |
2201 | mem_cgroup_oom_notify(memcg); | 2047 | mem_cgroup_oom_notify(memcg); |
2202 | spin_unlock(&memcg_oom_lock); | ||
2203 | 2048 | ||
2204 | if (need_to_kill) { | 2049 | if (locked && !memcg->oom_kill_disable) { |
2205 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2050 | mem_cgroup_unmark_under_oom(memcg); |
2206 | mem_cgroup_out_of_memory(memcg, mask, order); | 2051 | mem_cgroup_out_of_memory(memcg, mask, order); |
2052 | mem_cgroup_oom_unlock(memcg); | ||
2053 | /* | ||
2054 | * There is no guarantee that an OOM-lock contender | ||
2055 | * sees the wakeups triggered by the OOM kill | ||
2056 | * uncharges. Wake any sleepers explicitly. | ||
2057 | */ | ||
2058 | memcg_oom_recover(memcg); | ||
2207 | } else { | 2059 | } else { |
2208 | schedule(); | 2060 | /* |
2209 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2061 | * A system call can just return -ENOMEM, but if this |
2062 | * is a page fault and somebody else is handling the | ||
2063 | * OOM already, we need to sleep on the OOM waitqueue | ||
2064 | * for this memcg until the situation is resolved. | ||
2065 | * Which can take some time because it might be | ||
2066 | * handled by a userspace task. | ||
2067 | * | ||
2068 | * However, this is the charge context, which means | ||
2069 | * that we may sit on a large call stack and hold | ||
2070 | * various filesystem locks, the mmap_sem etc. and we | ||
2071 | * don't want the OOM handler to deadlock on them | ||
2072 | * while we sit here and wait. Store the current OOM | ||
2073 | * context in the task_struct, then return -ENOMEM. | ||
2074 | * At the end of the page fault handler, with the | ||
2075 | * stack unwound, pagefault_out_of_memory() will check | ||
2076 | * back with us by calling | ||
2077 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
2078 | * task to sleep. | ||
2079 | */ | ||
2080 | current->memcg_oom.oom_locked = locked; | ||
2081 | current->memcg_oom.wakeups = wakeups; | ||
2082 | css_get(&memcg->css); | ||
2083 | current->memcg_oom.wait_on_memcg = memcg; | ||
2210 | } | 2084 | } |
2211 | spin_lock(&memcg_oom_lock); | 2085 | } |
2212 | if (locked) | ||
2213 | mem_cgroup_oom_unlock(memcg); | ||
2214 | memcg_wakeup_oom(memcg); | ||
2215 | spin_unlock(&memcg_oom_lock); | ||
2216 | 2086 | ||
2217 | mem_cgroup_unmark_under_oom(memcg); | 2087 | /** |
2088 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | ||
2089 | * | ||
2090 | * This has to be called at the end of a page fault if the memcg | ||
2091 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | ||
2092 | * | ||
2093 | * Memcg supports userspace OOM handling, so failed allocations must | ||
2094 | * sleep on a waitqueue until the userspace task resolves the | ||
2095 | * situation. Sleeping directly in the charge context with all kinds | ||
2096 | * of locks held is not a good idea, instead we remember an OOM state | ||
2097 | * in the task and mem_cgroup_oom_synchronize() has to be called at | ||
2098 | * the end of the page fault to put the task to sleep and clean up the | ||
2099 | * OOM state. | ||
2100 | * | ||
2101 | * Returns %true if an ongoing memcg OOM situation was detected and | ||
2102 | * finalized, %false otherwise. | ||
2103 | */ | ||
2104 | bool mem_cgroup_oom_synchronize(void) | ||
2105 | { | ||
2106 | struct oom_wait_info owait; | ||
2107 | struct mem_cgroup *memcg; | ||
2218 | 2108 | ||
2219 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2109 | /* OOM is global, do not handle */ |
2110 | if (!current->memcg_oom.in_memcg_oom) | ||
2220 | return false; | 2111 | return false; |
2221 | /* Give chance to dying process */ | 2112 | |
2222 | schedule_timeout_uninterruptible(1); | 2113 | /* |
2114 | * We invoked the OOM killer but there is a chance that a kill | ||
2115 | * did not free up any charges. Everybody else might already | ||
2116 | * be sleeping, so restart the fault and keep the rampage | ||
2117 | * going until some charges are released. | ||
2118 | */ | ||
2119 | memcg = current->memcg_oom.wait_on_memcg; | ||
2120 | if (!memcg) | ||
2121 | goto out; | ||
2122 | |||
2123 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
2124 | goto out_memcg; | ||
2125 | |||
2126 | owait.memcg = memcg; | ||
2127 | owait.wait.flags = 0; | ||
2128 | owait.wait.func = memcg_oom_wake_function; | ||
2129 | owait.wait.private = current; | ||
2130 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
2131 | |||
2132 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | ||
2133 | /* Only sleep if we didn't miss any wakeups since OOM */ | ||
2134 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | ||
2135 | schedule(); | ||
2136 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
2137 | out_memcg: | ||
2138 | mem_cgroup_unmark_under_oom(memcg); | ||
2139 | if (current->memcg_oom.oom_locked) { | ||
2140 | mem_cgroup_oom_unlock(memcg); | ||
2141 | /* | ||
2142 | * There is no guarantee that an OOM-lock contender | ||
2143 | * sees the wakeups triggered by the OOM kill | ||
2144 | * uncharges. Wake any sleepers explicitely. | ||
2145 | */ | ||
2146 | memcg_oom_recover(memcg); | ||
2147 | } | ||
2148 | css_put(&memcg->css); | ||
2149 | current->memcg_oom.wait_on_memcg = NULL; | ||
2150 | out: | ||
2151 | current->memcg_oom.in_memcg_oom = 0; | ||
2223 | return true; | 2152 | return true; |
2224 | } | 2153 | } |
2225 | 2154 | ||
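The wakeup accounting above -- snapshot memcg->oom_wakeups before blocking and only sleep if the counter has not moved -- is the standard guard against missed wakeups. A userspace approximation using POSIX threads (illustrative only; the kernel open-codes the pattern on its own waitqueue rather than using a condition variable):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int wakeups;                 /* counterpart of memcg->oom_wakeups */

    static void wake_all(void)
    {
        pthread_mutex_lock(&lock);
        wakeups++;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
    }

    /* Only sleep if no wakeup happened since the caller took its snapshot. */
    static void wait_unless_woken(int snapshot)
    {
        pthread_mutex_lock(&lock);
        while (wakeups == snapshot)
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
    }

    static void *waker(void *arg)
    {
        (void)arg;
        wake_all();
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        int snap;

        pthread_mutex_lock(&lock);
        snap = wakeups;                 /* snapshot before attempting the "lock" */
        pthread_mutex_unlock(&lock);

        pthread_create(&t, NULL, waker, NULL);
        wait_unless_woken(snap);        /* cannot miss the waker's broadcast */
        pthread_join(t, NULL);
        puts("woken");
        return 0;
    }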
@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
2288 | } | 2217 | } |
2289 | 2218 | ||
2290 | void mem_cgroup_update_page_stat(struct page *page, | 2219 | void mem_cgroup_update_page_stat(struct page *page, |
2291 | enum mem_cgroup_page_stat_item idx, int val) | 2220 | enum mem_cgroup_stat_index idx, int val) |
2292 | { | 2221 | { |
2293 | struct mem_cgroup *memcg; | 2222 | struct mem_cgroup *memcg; |
2294 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2223 | struct page_cgroup *pc = lookup_page_cgroup(page); |
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2297 | if (mem_cgroup_disabled()) | 2226 | if (mem_cgroup_disabled()) |
2298 | return; | 2227 | return; |
2299 | 2228 | ||
2229 | VM_BUG_ON(!rcu_read_lock_held()); | ||
2300 | memcg = pc->mem_cgroup; | 2230 | memcg = pc->mem_cgroup; |
2301 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2231 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
2302 | return; | 2232 | return; |
2303 | 2233 | ||
2304 | switch (idx) { | ||
2305 | case MEMCG_NR_FILE_MAPPED: | ||
2306 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
2307 | break; | ||
2308 | default: | ||
2309 | BUG(); | ||
2310 | } | ||
2311 | |||
2312 | this_cpu_add(memcg->stat->count[idx], val); | 2234 | this_cpu_add(memcg->stat->count[idx], val); |
2313 | } | 2235 | } |
2314 | 2236 | ||
@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2450 | flush_work(&stock->work); | 2372 | flush_work(&stock->work); |
2451 | } | 2373 | } |
2452 | out: | 2374 | out: |
2453 | put_online_cpus(); | 2375 | put_online_cpus(); |
2454 | } | 2376 | } |
2455 | 2377 | ||
2456 | /* | 2378 | /* |
@@ -2532,12 +2454,11 @@ enum { | |||
2532 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2454 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
2533 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | 2455 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ |
2534 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ | 2456 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ |
2535 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | ||
2536 | }; | 2457 | }; |
2537 | 2458 | ||
2538 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2459 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2539 | unsigned int nr_pages, unsigned int min_pages, | 2460 | unsigned int nr_pages, unsigned int min_pages, |
2540 | bool oom_check) | 2461 | bool invoke_oom) |
2541 | { | 2462 | { |
2542 | unsigned long csize = nr_pages * PAGE_SIZE; | 2463 | unsigned long csize = nr_pages * PAGE_SIZE; |
2543 | struct mem_cgroup *mem_over_limit; | 2464 | struct mem_cgroup *mem_over_limit; |
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2594 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 2515 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
2595 | return CHARGE_RETRY; | 2516 | return CHARGE_RETRY; |
2596 | 2517 | ||
2597 | /* If we don't need to call oom-killer at all, return immediately */ | 2518 | if (invoke_oom) |
2598 | if (!oom_check) | 2519 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); |
2599 | return CHARGE_NOMEM; | ||
2600 | /* check OOM */ | ||
2601 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) | ||
2602 | return CHARGE_OOM_DIE; | ||
2603 | 2520 | ||
2604 | return CHARGE_RETRY; | 2521 | return CHARGE_NOMEM; |
2605 | } | 2522 | } |
2606 | 2523 | ||
2607 | /* | 2524 | /* |
@@ -2704,7 +2621,7 @@ again: | |||
2704 | } | 2621 | } |
2705 | 2622 | ||
2706 | do { | 2623 | do { |
2707 | bool oom_check; | 2624 | bool invoke_oom = oom && !nr_oom_retries; |
2708 | 2625 | ||
2709 | /* If killed, bypass charge */ | 2626 | /* If killed, bypass charge */ |
2710 | if (fatal_signal_pending(current)) { | 2627 | if (fatal_signal_pending(current)) { |
@@ -2712,14 +2629,8 @@ again: | |||
2712 | goto bypass; | 2629 | goto bypass; |
2713 | } | 2630 | } |
2714 | 2631 | ||
2715 | oom_check = false; | 2632 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, |
2716 | if (oom && !nr_oom_retries) { | 2633 | nr_pages, invoke_oom); |
2717 | oom_check = true; | ||
2718 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
2719 | } | ||
2720 | |||
2721 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, | ||
2722 | oom_check); | ||
2723 | switch (ret) { | 2634 | switch (ret) { |
2724 | case CHARGE_OK: | 2635 | case CHARGE_OK: |
2725 | break; | 2636 | break; |
@@ -2732,16 +2643,12 @@ again: | |||
2732 | css_put(&memcg->css); | 2643 | css_put(&memcg->css); |
2733 | goto nomem; | 2644 | goto nomem; |
2734 | case CHARGE_NOMEM: /* OOM routine works */ | 2645 | case CHARGE_NOMEM: /* OOM routine works */ |
2735 | if (!oom) { | 2646 | if (!oom || invoke_oom) { |
2736 | css_put(&memcg->css); | 2647 | css_put(&memcg->css); |
2737 | goto nomem; | 2648 | goto nomem; |
2738 | } | 2649 | } |
2739 | /* If oom, we never return -ENOMEM */ | ||
2740 | nr_oom_retries--; | 2650 | nr_oom_retries--; |
2741 | break; | 2651 | break; |
2742 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | ||
2743 | css_put(&memcg->css); | ||
2744 | goto bypass; | ||
2745 | } | 2652 | } |
2746 | } while (ret != CHARGE_OK); | 2653 | } while (ret != CHARGE_OK); |
2747 | 2654 | ||
@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2882 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | 2789 | * is accessed after testing USED bit. To make pc->mem_cgroup visible |
2883 | * before USED bit, we need memory barrier here. | 2790 | * before USED bit, we need memory barrier here. |
2884 | * See mem_cgroup_add_lru_list(), etc. | 2791 | * See mem_cgroup_add_lru_list(), etc. |
2885 | */ | 2792 | */ |
2886 | smp_wmb(); | 2793 | smp_wmb(); |
2887 | SetPageCgroupUsed(pc); | 2794 | SetPageCgroupUsed(pc); |
2888 | 2795 | ||
@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2905 | unlock_page_cgroup(pc); | 2812 | unlock_page_cgroup(pc); |
2906 | 2813 | ||
2907 | /* | 2814 | /* |
2908 | * "charge_statistics" updated event counter. Then, check it. | 2815 | * "charge_statistics" updated event counter. |
2909 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
2910 | * if they exceeds softlimit. | ||
2911 | */ | 2816 | */ |
2912 | memcg_check_events(memcg, page); | 2817 | memcg_check_events(memcg, page); |
2913 | } | 2818 | } |
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3626 | * the page allocator. Therefore, the following sequence when backed by | 3531 | * the page allocator. Therefore, the following sequence when backed by |
3627 | * the SLUB allocator: | 3532 | * the SLUB allocator: |
3628 | * | 3533 | * |
3629 | * memcg_stop_kmem_account(); | 3534 | * memcg_stop_kmem_account(); |
3630 | * kmalloc(<large_number>) | 3535 | * kmalloc(<large_number>) |
3631 | * memcg_resume_kmem_account(); | 3536 | * memcg_resume_kmem_account(); |
3632 | * | 3537 | * |
3633 | * would effectively ignore the fact that we should skip accounting, | 3538 | * would effectively ignore the fact that we should skip accounting, |
3634 | * since it will drive us directly to this function without passing | 3539 | * since it will drive us directly to this function without passing |
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3750 | } | 3655 | } |
3751 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3656 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3752 | 3657 | ||
3658 | static inline | ||
3659 | void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | ||
3660 | struct mem_cgroup *to, | ||
3661 | unsigned int nr_pages, | ||
3662 | enum mem_cgroup_stat_index idx) | ||
3663 | { | ||
3664 | /* Update stat data for mem_cgroup */ | ||
3665 | preempt_disable(); | ||
3666 | WARN_ON_ONCE(from->stat->count[idx] < nr_pages); | ||
3667 | __this_cpu_add(from->stat->count[idx], -nr_pages); | ||
3668 | __this_cpu_add(to->stat->count[idx], nr_pages); | ||
3669 | preempt_enable(); | ||
3670 | } | ||
3671 | |||
3753 | /** | 3672 | /** |
3754 | * mem_cgroup_move_account - move account of the page | 3673 | * mem_cgroup_move_account - move account of the page |
3755 | * @page: the page | 3674 | * @page: the page |
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page, | |||
3795 | 3714 | ||
3796 | move_lock_mem_cgroup(from, &flags); | 3715 | move_lock_mem_cgroup(from, &flags); |
3797 | 3716 | ||
3798 | if (!anon && page_mapped(page)) { | 3717 | if (!anon && page_mapped(page)) |
3799 | /* Update mapped_file data for mem_cgroup */ | 3718 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
3800 | preempt_disable(); | 3719 | MEM_CGROUP_STAT_FILE_MAPPED); |
3801 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3720 | |
3802 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3721 | if (PageWriteback(page)) |
3803 | preempt_enable(); | 3722 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
3804 | } | 3723 | MEM_CGROUP_STAT_WRITEBACK); |
3724 | |||
3805 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); | 3725 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
3806 | 3726 | ||
3807 | /* caller should have done css_get */ | 3727 | /* caller should have done css_get */ |
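The helper above and its two callers in mem_cgroup_move_account() share one pattern: a page statistic moves between groups as a paired decrement and increment, so the sum across groups stays stable. A minimal sketch with C11 atomics in place of per-cpu counters; the names are illustrative:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { STAT_FILE_MAPPED, STAT_WRITEBACK, NR_STATS };

    struct group_stats {
        atomic_long count[NR_STATS];
    };

    static void move_page_stat(struct group_stats *from, struct group_stats *to,
                               long nr_pages, int idx)
    {
        if (atomic_load(&from->count[idx]) < nr_pages)
            fprintf(stderr, "warning: moving more than was accounted\n");
        atomic_fetch_sub(&from->count[idx], nr_pages);
        atomic_fetch_add(&to->count[idx], nr_pages);
    }

    int main(void)
    {
        static struct group_stats a, b;

        atomic_store(&a.count[STAT_WRITEBACK], 4);
        move_page_stat(&a, &b, 4, STAT_WRITEBACK);
        printf("a=%ld b=%ld\n",
               (long)atomic_load(&a.count[STAT_WRITEBACK]),
               (long)atomic_load(&b.count[STAT_WRITEBACK]));
        return 0;
    }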
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
4657 | MEM_CGROUP_RECLAIM_SHRINK); | 4577 | MEM_CGROUP_RECLAIM_SHRINK); |
4658 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4578 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
4659 | /* Usage is reduced ? */ | 4579 | /* Usage is reduced ? */ |
4660 | if (curusage >= oldusage) | 4580 | if (curusage >= oldusage) |
4661 | retry_count--; | 4581 | retry_count--; |
4662 | else | 4582 | else |
4663 | oldusage = curusage; | 4583 | oldusage = curusage; |
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4678 | int enlarge = 0; | 4598 | int enlarge = 0; |
4679 | 4599 | ||
4680 | /* see mem_cgroup_resize_res_limit */ | 4600 | /* see mem_cgroup_resize_res_limit */ |
4681 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 4601 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
4682 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4602 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
4683 | while (retry_count) { | 4603 | while (retry_count) { |
4684 | if (signal_pending(current)) { | 4604 | if (signal_pending(current)) { |
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4727 | return ret; | 4647 | return ret; |
4728 | } | 4648 | } |
4729 | 4649 | ||
4730 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
4731 | gfp_t gfp_mask, | ||
4732 | unsigned long *total_scanned) | ||
4733 | { | ||
4734 | unsigned long nr_reclaimed = 0; | ||
4735 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
4736 | unsigned long reclaimed; | ||
4737 | int loop = 0; | ||
4738 | struct mem_cgroup_tree_per_zone *mctz; | ||
4739 | unsigned long long excess; | ||
4740 | unsigned long nr_scanned; | ||
4741 | |||
4742 | if (order > 0) | ||
4743 | return 0; | ||
4744 | |||
4745 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
4746 | /* | ||
4747 | * This loop can run a while, specially if mem_cgroup's continuously | ||
4748 | * keep exceeding their soft limit and putting the system under | ||
4749 | * pressure | ||
4750 | */ | ||
4751 | do { | ||
4752 | if (next_mz) | ||
4753 | mz = next_mz; | ||
4754 | else | ||
4755 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
4756 | if (!mz) | ||
4757 | break; | ||
4758 | |||
4759 | nr_scanned = 0; | ||
4760 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
4761 | gfp_mask, &nr_scanned); | ||
4762 | nr_reclaimed += reclaimed; | ||
4763 | *total_scanned += nr_scanned; | ||
4764 | spin_lock(&mctz->lock); | ||
4765 | |||
4766 | /* | ||
4767 | * If we failed to reclaim anything from this memory cgroup | ||
4768 | * it is time to move on to the next cgroup | ||
4769 | */ | ||
4770 | next_mz = NULL; | ||
4771 | if (!reclaimed) { | ||
4772 | do { | ||
4773 | /* | ||
4774 | * Loop until we find yet another one. | ||
4775 | * | ||
4776 | * By the time we get the soft_limit lock | ||
4777 | * again, someone might have added the | ||
4778 | * group back on the RB tree. Iterate to | ||
4779 | * make sure we get a different mem. | ||
4780 | * mem_cgroup_largest_soft_limit_node returns | ||
4781 | * NULL if no other cgroup is present on | ||
4782 | * the tree | ||
4783 | */ | ||
4784 | next_mz = | ||
4785 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
4786 | if (next_mz == mz) | ||
4787 | css_put(&next_mz->memcg->css); | ||
4788 | else /* next_mz == NULL or other memcg */ | ||
4789 | break; | ||
4790 | } while (1); | ||
4791 | } | ||
4792 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
4793 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
4794 | /* | ||
4795 | * One school of thought says that we should not add | ||
4796 | * back the node to the tree if reclaim returns 0. | ||
4797 | * But our reclaim could return 0, simply because due | ||
4798 | * to priority we are exposing a smaller subset of | ||
4799 | * memory to reclaim from. Consider this as a longer | ||
4800 | * term TODO. | ||
4801 | */ | ||
4802 | /* If excess == 0, no tree ops */ | ||
4803 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
4804 | spin_unlock(&mctz->lock); | ||
4805 | css_put(&mz->memcg->css); | ||
4806 | loop++; | ||
4807 | /* | ||
4808 | * Could not reclaim anything and there are no more | ||
4809 | * mem cgroups to try or we seem to be looping without | ||
4810 | * reclaiming anything. | ||
4811 | */ | ||
4812 | if (!nr_reclaimed && | ||
4813 | (next_mz == NULL || | ||
4814 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
4815 | break; | ||
4816 | } while (!nr_reclaimed); | ||
4817 | if (next_mz) | ||
4818 | css_put(&next_mz->memcg->css); | ||
4819 | return nr_reclaimed; | ||
4820 | } | ||
4821 | |||
4822 | /** | 4650 | /** |
4823 | * mem_cgroup_force_empty_list - clears LRU of a group | 4651 | * mem_cgroup_force_empty_list - clears LRU of a group |
4824 | * @memcg: group to clear | 4652 | * @memcg: group to clear |
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, | |||
4990 | unsigned int event) | 4818 | unsigned int event) |
4991 | { | 4819 | { |
4992 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4820 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4993 | int ret; | ||
4994 | 4821 | ||
4995 | if (mem_cgroup_is_root(memcg)) | 4822 | if (mem_cgroup_is_root(memcg)) |
4996 | return -EINVAL; | 4823 | return -EINVAL; |
4997 | css_get(&memcg->css); | 4824 | return mem_cgroup_force_empty(memcg); |
4998 | ret = mem_cgroup_force_empty(memcg); | ||
4999 | css_put(&memcg->css); | ||
5000 | |||
5001 | return ret; | ||
5002 | } | 4825 | } |
5003 | 4826 | ||
5004 | |||
5005 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | 4827 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, |
5006 | struct cftype *cft) | 4828 | struct cftype *cft) |
5007 | { | 4829 | { |
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
5139 | */ | 4961 | */ |
5140 | mutex_lock(&memcg_create_mutex); | 4962 | mutex_lock(&memcg_create_mutex); |
5141 | mutex_lock(&set_limit_mutex); | 4963 | mutex_lock(&set_limit_mutex); |
5142 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4964 | if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { |
5143 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { | 4965 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { |
5144 | ret = -EBUSY; | 4966 | ret = -EBUSY; |
5145 | goto out; | 4967 | goto out; |
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
5149 | 4971 | ||
5150 | ret = memcg_update_cache_sizes(memcg); | 4972 | ret = memcg_update_cache_sizes(memcg); |
5151 | if (ret) { | 4973 | if (ret) { |
5152 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 4974 | res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); |
5153 | goto out; | 4975 | goto out; |
5154 | } | 4976 | } |
5155 | static_key_slow_inc(&memcg_kmem_enabled_key); | 4977 | static_key_slow_inc(&memcg_kmem_enabled_key); |
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
6089 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
6090 | mz = &pn->zoneinfo[zone]; | 5912 | mz = &pn->zoneinfo[zone]; |
6091 | lruvec_init(&mz->lruvec); | 5913 | lruvec_init(&mz->lruvec); |
6092 | mz->usage_in_excess = 0; | ||
6093 | mz->on_tree = false; | ||
6094 | mz->memcg = memcg; | 5914 | mz->memcg = memcg; |
6095 | } | 5915 | } |
6096 | memcg->nodeinfo[node] = pn; | 5916 | memcg->nodeinfo[node] = pn; |
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
6146 | int node; | 5966 | int node; |
6147 | size_t size = memcg_size(); | 5967 | size_t size = memcg_size(); |
6148 | 5968 | ||
6149 | mem_cgroup_remove_from_trees(memcg); | ||
6150 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
6151 | 5970 | ||
6152 | for_each_node(node) | 5971 | for_each_node(node) |
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6183 | } | 6002 | } |
6184 | EXPORT_SYMBOL(parent_mem_cgroup); | 6003 | EXPORT_SYMBOL(parent_mem_cgroup); |
6185 | 6004 | ||
6186 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
6187 | { | ||
6188 | struct mem_cgroup_tree_per_node *rtpn; | ||
6189 | struct mem_cgroup_tree_per_zone *rtpz; | ||
6190 | int tmp, node, zone; | ||
6191 | |||
6192 | for_each_node(node) { | ||
6193 | tmp = node; | ||
6194 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
6195 | tmp = -1; | ||
6196 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
6197 | BUG_ON(!rtpn); | ||
6198 | |||
6199 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
6200 | |||
6201 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
6202 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
6203 | rtpz->rb_root = RB_ROOT; | ||
6204 | spin_lock_init(&rtpz->lock); | ||
6205 | } | ||
6206 | } | ||
6207 | } | ||
6208 | |||
6209 | static struct cgroup_subsys_state * __ref | 6005 | static struct cgroup_subsys_state * __ref |
6210 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
6211 | { | 6007 | { |
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6235 | mutex_init(&memcg->thresholds_lock); | 6031 | mutex_init(&memcg->thresholds_lock); |
6236 | spin_lock_init(&memcg->move_lock); | 6032 | spin_lock_init(&memcg->move_lock); |
6237 | vmpressure_init(&memcg->vmpressure); | 6033 | vmpressure_init(&memcg->vmpressure); |
6034 | spin_lock_init(&memcg->soft_lock); | ||
6238 | 6035 | ||
6239 | return &memcg->css; | 6036 | return &memcg->css; |
6240 | 6037 | ||
@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6312 | 6109 | ||
6313 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
6314 | mem_cgroup_reparent_charges(memcg); | 6111 | mem_cgroup_reparent_charges(memcg); |
6112 | if (memcg->soft_contributed) { | ||
6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
6114 | atomic_dec(&memcg->children_in_excess); | ||
6115 | |||
6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
6118 | } | ||
6315 | mem_cgroup_destroy_all_caches(memcg); | 6119 | mem_cgroup_destroy_all_caches(memcg); |
6316 | vmpressure_cleanup(&memcg->vmpressure); | 6120 | vmpressure_cleanup(&memcg->vmpressure); |
6317 | } | 6121 | } |
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void) | |||
6986 | { | 6790 | { |
6987 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
6988 | enable_swap_cgroup(); | 6792 | enable_swap_cgroup(); |
6989 | mem_cgroup_soft_limit_tree_init(); | ||
6990 | memcg_stock_init(); | 6793 | memcg_stock_init(); |
6991 | return 0; | 6794 | return 0; |
6992 | } | 6795 | } |
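The memcontrol.c hunks above remove the per-zone soft limit RB tree (its init routine, the on-tree fields and the remove-from-trees call) and instead, in mem_cgroup_css_offline(), walk the parents of a group that had been contributing to soft limit excess and decrement their children_in_excess counters. Below is a minimal userspace sketch of that bookkeeping pattern only; struct group and soft_limit_set_excess() are invented for illustration and stand in for the paths that flip soft_contributed in the real code.

    /* Minimal userspace model of the children_in_excess propagation that
     * replaces the per-zone soft limit RB tree in this series.  The types
     * and helpers are invented; only the walk-the-parents pattern mirrors
     * mem_cgroup_css_offline() above. */
    #include <stdio.h>
    #include <stdbool.h>

    struct group {
        struct group *parent;
        int children_in_excess;  /* descendants currently over their soft limit */
        bool soft_contributed;   /* this group counts against its ancestors */
    };

    static void soft_limit_set_excess(struct group *g, bool in_excess)
    {
        struct group *p;

        if (g->soft_contributed == in_excess)
            return;
        g->soft_contributed = in_excess;
        for (p = g->parent; p; p = p->parent)
            p->children_in_excess += in_excess ? 1 : -1;
    }

    int main(void)
    {
        struct group root = { 0 }, a = { .parent = &root }, b = { .parent = &a };

        soft_limit_set_excess(&b, true);   /* b goes over its soft limit */
        printf("root=%d a=%d\n", root.children_in_excess, a.children_in_excess);
        soft_limit_set_excess(&b, false);  /* b drops back under (or goes offline) */
        printf("root=%d a=%d\n", root.children_in_excess, a.children_in_excess);
        return 0;
    }

Keeping a counter on every ancestor lets reclaim ask "does this subtree contain a group over its soft limit?" without maintaining a global tree under a spinlock.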
diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..ca0003947115 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3695 | * but allow concurrent faults), and pte mapped but not yet locked. | 3695 | * but allow concurrent faults), and pte mapped but not yet locked. |
3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
3697 | */ | 3697 | */ |
3698 | int handle_pte_fault(struct mm_struct *mm, | 3698 | static int handle_pte_fault(struct mm_struct *mm, |
3699 | struct vm_area_struct *vma, unsigned long address, | 3699 | struct vm_area_struct *vma, unsigned long address, |
3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3701 | { | 3701 | { |
@@ -3754,22 +3754,14 @@ unlock: | |||
3754 | /* | 3754 | /* |
3755 | * By the time we get here, we already hold the mm semaphore | 3755 | * By the time we get here, we already hold the mm semaphore |
3756 | */ | 3756 | */ |
3757 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3757 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3758 | unsigned long address, unsigned int flags) | 3758 | unsigned long address, unsigned int flags) |
3759 | { | 3759 | { |
3760 | pgd_t *pgd; | 3760 | pgd_t *pgd; |
3761 | pud_t *pud; | 3761 | pud_t *pud; |
3762 | pmd_t *pmd; | 3762 | pmd_t *pmd; |
3763 | pte_t *pte; | 3763 | pte_t *pte; |
3764 | 3764 | ||
3765 | __set_current_state(TASK_RUNNING); | ||
3766 | |||
3767 | count_vm_event(PGFAULT); | ||
3768 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3769 | |||
3770 | /* do counter updates before entering really critical section. */ | ||
3771 | check_sync_rss_stat(current); | ||
3772 | |||
3773 | if (unlikely(is_vm_hugetlb_page(vma))) | 3765 | if (unlikely(is_vm_hugetlb_page(vma))) |
3774 | return hugetlb_fault(mm, vma, address, flags); | 3766 | return hugetlb_fault(mm, vma, address, flags); |
3775 | 3767 | ||
@@ -3782,9 +3774,12 @@ retry: | |||
3782 | if (!pmd) | 3774 | if (!pmd) |
3783 | return VM_FAULT_OOM; | 3775 | return VM_FAULT_OOM; |
3784 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3776 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3777 | int ret = VM_FAULT_FALLBACK; | ||
3785 | if (!vma->vm_ops) | 3778 | if (!vma->vm_ops) |
3786 | return do_huge_pmd_anonymous_page(mm, vma, address, | 3779 | ret = do_huge_pmd_anonymous_page(mm, vma, address, |
3787 | pmd, flags); | 3780 | pmd, flags); |
3781 | if (!(ret & VM_FAULT_FALLBACK)) | ||
3782 | return ret; | ||
3788 | } else { | 3783 | } else { |
3789 | pmd_t orig_pmd = *pmd; | 3784 | pmd_t orig_pmd = *pmd; |
3790 | int ret; | 3785 | int ret; |
@@ -3850,6 +3845,37 @@ retry: | |||
3850 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3845 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3851 | } | 3846 | } |
3852 | 3847 | ||
3848 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3849 | unsigned long address, unsigned int flags) | ||
3850 | { | ||
3851 | int ret; | ||
3852 | |||
3853 | __set_current_state(TASK_RUNNING); | ||
3854 | |||
3855 | count_vm_event(PGFAULT); | ||
3856 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3857 | |||
3858 | /* do counter updates before entering really critical section. */ | ||
3859 | check_sync_rss_stat(current); | ||
3860 | |||
3861 | /* | ||
3862 | * Enable the memcg OOM handling for faults triggered in user | ||
3863 | * space. Kernel faults are handled more gracefully. | ||
3864 | */ | ||
3865 | if (flags & FAULT_FLAG_USER) | ||
3866 | mem_cgroup_enable_oom(); | ||
3867 | |||
3868 | ret = __handle_mm_fault(mm, vma, address, flags); | ||
3869 | |||
3870 | if (flags & FAULT_FLAG_USER) | ||
3871 | mem_cgroup_disable_oom(); | ||
3872 | |||
3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | ||
3874 | mem_cgroup_oom_synchronize(); | ||
3875 | |||
3876 | return ret; | ||
3877 | } | ||
3878 | |||
3853 | #ifndef __PAGETABLE_PUD_FOLDED | 3879 | #ifndef __PAGETABLE_PUD_FOLDED |
3854 | /* | 3880 | /* |
3855 | * Allocate page upper directory. | 3881 | * Allocate page upper directory. |
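The memory.c hunk splits the old handle_mm_fault() into __handle_mm_fault() plus a thin wrapper that enables memcg OOM handling only around user-space faults and synchronizes a pending memcg OOM afterwards. A hedged userspace sketch of that enable/call/disable shape follows; FAULT_FLAG_USER_DEMO, inner_fault(), outer_fault() and memcg_oom_enabled are invented stand-ins, not kernel symbols.

    /* Userspace sketch of the wrapper pattern introduced for handle_mm_fault(). */
    #include <stdio.h>
    #include <stdbool.h>

    #define FAULT_FLAG_USER_DEMO 0x1

    static bool memcg_oom_enabled;  /* models the per-task memcg OOM state */

    static int inner_fault(unsigned int flags)
    {
        /* the real __handle_mm_fault() walks pgd/pud/pmd/pte here */
        printf("fault 0x%x handled with memcg OOM %s\n",
               flags, memcg_oom_enabled ? "enabled" : "disabled");
        return 0;
    }

    static int outer_fault(unsigned int flags)
    {
        int ret;

        if (flags & FAULT_FLAG_USER_DEMO)
            memcg_oom_enabled = true;   /* mem_cgroup_enable_oom() */

        ret = inner_fault(flags);

        if (flags & FAULT_FLAG_USER_DEMO)
            memcg_oom_enabled = false;  /* mem_cgroup_disable_oom() */

        return ret;
    }

    int main(void)
    {
        outer_fault(FAULT_FLAG_USER_DEMO);  /* user-space fault */
        outer_fault(0);                     /* kernel fault */
        return 0;
    }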
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 98e75f2ac7bc..314e9d274381 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -678,9 +678,12 @@ out: | |||
678 | */ | 678 | */ |
679 | void pagefault_out_of_memory(void) | 679 | void pagefault_out_of_memory(void) |
680 | { | 680 | { |
681 | struct zonelist *zonelist = node_zonelist(first_online_node, | 681 | struct zonelist *zonelist; |
682 | GFP_KERNEL); | ||
683 | 682 | ||
683 | if (mem_cgroup_oom_synchronize()) | ||
684 | return; | ||
685 | |||
686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | ||
684 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | 687 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { |
685 | out_of_memory(NULL, 0, 0, NULL, false); | 688 | out_of_memory(NULL, 0, 0, NULL, false); |
686 | clear_zonelist_oom(zonelist, GFP_KERNEL); | 689 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
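The oom_kill.c hunk lets pagefault_out_of_memory() give a pending memcg OOM a chance to complete before falling back to the global OOM killer. A small illustrative model of that early-return pattern, with invented helpers and a plain flag in place of the real task state, might look like this:

    #include <stdio.h>
    #include <stdbool.h>

    static bool memcg_oom_pending;  /* models task_in_memcg_oom(current) */

    static bool memcg_oom_synchronize(void)
    {
        if (!memcg_oom_pending)
            return false;
        printf("handled by the memcg OOM killer\n");
        memcg_oom_pending = false;
        return true;
    }

    static void global_out_of_memory(void)
    {
        printf("falling back to the global OOM killer\n");
    }

    static void demo_pagefault_oom(void)
    {
        if (memcg_oom_synchronize())
            return;
        global_out_of_memory();
    }

    int main(void)
    {
        memcg_oom_pending = true;
        demo_pagefault_oom();  /* memcg path */
        demo_pagefault_oom();  /* global path */
        return 0;
    }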
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6c7b0187be8e..f5236f804aa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
2143 | 2143 | ||
2144 | /* | 2144 | /* |
2145 | * Helper function for set_page_writeback family. | 2145 | * Helper function for set_page_writeback family. |
2146 | * | ||
2147 | * The caller must hold the mem_cgroup_begin/end_update_page_stat() lock | ||
2148 | * while calling this function. | ||
2149 | * See test_set_page_writeback() for an example. | ||
2150 | * | ||
2146 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic | 2151 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic |
2147 | * wrt interrupts. | 2152 | * wrt interrupts. |
2148 | */ | 2153 | */ |
2149 | void account_page_writeback(struct page *page) | 2154 | void account_page_writeback(struct page *page) |
2150 | { | 2155 | { |
2156 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
2151 | inc_zone_page_state(page, NR_WRITEBACK); | 2157 | inc_zone_page_state(page, NR_WRITEBACK); |
2152 | } | 2158 | } |
2153 | EXPORT_SYMBOL(account_page_writeback); | 2159 | EXPORT_SYMBOL(account_page_writeback); |
@@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page) | |||
2364 | { | 2370 | { |
2365 | struct address_space *mapping = page_mapping(page); | 2371 | struct address_space *mapping = page_mapping(page); |
2366 | int ret; | 2372 | int ret; |
2373 | bool locked; | ||
2374 | unsigned long memcg_flags; | ||
2367 | 2375 | ||
2376 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
2368 | if (mapping) { | 2377 | if (mapping) { |
2369 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2378 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2370 | unsigned long flags; | 2379 | unsigned long flags; |
@@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page) | |||
2385 | ret = TestClearPageWriteback(page); | 2394 | ret = TestClearPageWriteback(page); |
2386 | } | 2395 | } |
2387 | if (ret) { | 2396 | if (ret) { |
2397 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
2388 | dec_zone_page_state(page, NR_WRITEBACK); | 2398 | dec_zone_page_state(page, NR_WRITEBACK); |
2389 | inc_zone_page_state(page, NR_WRITTEN); | 2399 | inc_zone_page_state(page, NR_WRITTEN); |
2390 | } | 2400 | } |
2401 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
2391 | return ret; | 2402 | return ret; |
2392 | } | 2403 | } |
2393 | 2404 | ||
@@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page) | |||
2395 | { | 2406 | { |
2396 | struct address_space *mapping = page_mapping(page); | 2407 | struct address_space *mapping = page_mapping(page); |
2397 | int ret; | 2408 | int ret; |
2409 | bool locked; | ||
2410 | unsigned long memcg_flags; | ||
2398 | 2411 | ||
2412 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
2399 | if (mapping) { | 2413 | if (mapping) { |
2400 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2414 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2401 | unsigned long flags; | 2415 | unsigned long flags; |
@@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page) | |||
2422 | } | 2436 | } |
2423 | if (!ret) | 2437 | if (!ret) |
2424 | account_page_writeback(page); | 2438 | account_page_writeback(page); |
2439 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
2425 | return ret; | 2440 | return ret; |
2426 | 2441 | ||
2427 | } | 2442 | } |
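The page-writeback.c hunks bracket the writeback flag updates with mem_cgroup_begin/end_update_page_stat() so the per-memcg MEM_CGROUP_STAT_WRITEBACK counter moves together with the page flag. A simplified userspace model of that bracket, using a plain mutex and a global counter as stand-ins (page_stat_lock, writeback_pages and struct demo_page are invented for the example), could be:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t page_stat_lock = PTHREAD_MUTEX_INITIALIZER;
    static long writeback_pages;  /* models MEM_CGROUP_STAT_WRITEBACK */

    struct demo_page { bool writeback; };

    static bool page_set_writeback(struct demo_page *page)
    {
        bool was_set;

        pthread_mutex_lock(&page_stat_lock);    /* mem_cgroup_begin_update_page_stat() */
        was_set = page->writeback;
        page->writeback = true;
        if (!was_set)
            writeback_pages++;                  /* mem_cgroup_inc_page_stat(...WRITEBACK) */
        pthread_mutex_unlock(&page_stat_lock);  /* mem_cgroup_end_update_page_stat() */

        return was_set;
    }

    int main(void)
    {
        struct demo_page page = { false };

        page_set_writeback(&page);
        page_set_writeback(&page);  /* second call must not double count */
        printf("writeback_pages = %ld\n", writeback_pages);
        return 0;
    }

In the kernel the lock additionally keeps the page from moving to another memcg while the statistic is updated; the sketch only shows the flag and the counter changing consistently with each other.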
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page, | |||
1052 | { | 1052 | { |
1053 | int first = atomic_inc_and_test(&page->_mapcount); | 1053 | int first = atomic_inc_and_test(&page->_mapcount); |
1054 | if (first) { | 1054 | if (first) { |
1055 | if (!PageTransHuge(page)) | 1055 | if (PageTransHuge(page)) |
1056 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1057 | else | ||
1058 | __inc_zone_page_state(page, | 1056 | __inc_zone_page_state(page, |
1059 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1057 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1058 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1059 | hpage_nr_pages(page)); | ||
1060 | } | 1060 | } |
1061 | if (unlikely(PageKsm(page))) | 1061 | if (unlikely(PageKsm(page))) |
1062 | return; | 1062 | return; |
@@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
1086 | SetPageSwapBacked(page); | 1086 | SetPageSwapBacked(page); |
1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
1088 | if (!PageTransHuge(page)) | 1088 | if (PageTransHuge(page)) |
1089 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1090 | else | ||
1091 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1089 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1090 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1091 | hpage_nr_pages(page)); | ||
1092 | __page_set_anon_rmap(page, vma, address, 1); | 1092 | __page_set_anon_rmap(page, vma, address, 1); |
1093 | if (!mlocked_vma_newpage(vma, page)) { | 1093 | if (!mlocked_vma_newpage(vma, page)) { |
1094 | SetPageActive(page); | 1094 | SetPageActive(page); |
@@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page) | |||
1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | 1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); |
1112 | if (atomic_inc_and_test(&page->_mapcount)) { | 1112 | if (atomic_inc_and_test(&page->_mapcount)) { |
1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1114 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1114 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1115 | } | 1115 | } |
1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
1117 | } | 1117 | } |
@@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page) | |||
1148 | goto out; | 1148 | goto out; |
1149 | if (anon) { | 1149 | if (anon) { |
1150 | mem_cgroup_uncharge_page(page); | 1150 | mem_cgroup_uncharge_page(page); |
1151 | if (!PageTransHuge(page)) | 1151 | if (PageTransHuge(page)) |
1152 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
1153 | else | ||
1154 | __dec_zone_page_state(page, | 1152 | __dec_zone_page_state(page, |
1155 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1153 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1154 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1155 | -hpage_nr_pages(page)); | ||
1156 | } else { | 1156 | } else { |
1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1158 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1158 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
1160 | } | 1160 | } |
1161 | if (unlikely(PageMlocked(page))) | 1161 | if (unlikely(PageMlocked(page))) |
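The rmap hunks above account NR_ANON_PAGES in base-page units via __mod_zone_page_state(..., hpage_nr_pages(page)), so a transparent huge page contributes a full huge page's worth of base pages rather than one. A small sketch of that accounting, with HPAGE_NR and the demo_* names invented for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    #define HPAGE_NR 512  /* 2MB THP with 4KB base pages, as on x86-64 */

    struct demo_page { bool trans_huge; };

    static long nr_anon_pages;  /* models the NR_ANON_PAGES zone counter */

    static int demo_hpage_nr_pages(const struct demo_page *page)
    {
        return page->trans_huge ? HPAGE_NR : 1;  /* hpage_nr_pages() */
    }

    static void demo_add_anon_rmap(const struct demo_page *page)
    {
        nr_anon_pages += demo_hpage_nr_pages(page);
    }

    static void demo_remove_rmap(const struct demo_page *page)
    {
        nr_anon_pages -= demo_hpage_nr_pages(page);
    }

    int main(void)
    {
        struct demo_page small = { false }, huge = { true };

        demo_add_anon_rmap(&small);
        demo_add_anon_rmap(&huge);
        printf("mapped: %ld base pages\n", nr_anon_pages);       /* 513 */
        demo_remove_rmap(&huge);
        printf("after unmap: %ld base pages\n", nr_anon_pages);  /* 1 */
        return 0;
    }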
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu) | |||
432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | 432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); |
433 | } | 433 | } |
434 | 434 | ||
435 | static bool need_activate_page_drain(int cpu) | ||
436 | { | ||
437 | return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; | ||
438 | } | ||
439 | |||
435 | void activate_page(struct page *page) | 440 | void activate_page(struct page *page) |
436 | { | 441 | { |
437 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 442 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu) | |||
449 | { | 454 | { |
450 | } | 455 | } |
451 | 456 | ||
457 | static bool need_activate_page_drain(int cpu) | ||
458 | { | ||
459 | return false; | ||
460 | } | ||
461 | |||
452 | void activate_page(struct page *page) | 462 | void activate_page(struct page *page) |
453 | { | 463 | { |
454 | struct zone *zone = page_zone(page); | 464 | struct zone *zone = page_zone(page); |
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) | |||
701 | lru_add_drain(); | 711 | lru_add_drain(); |
702 | } | 712 | } |
703 | 713 | ||
704 | /* | 714 | static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); |
705 | * Returns 0 for success | 715 | |
706 | */ | 716 | void lru_add_drain_all(void) |
707 | int lru_add_drain_all(void) | ||
708 | { | 717 | { |
709 | return schedule_on_each_cpu(lru_add_drain_per_cpu); | 718 | static DEFINE_MUTEX(lock); |
719 | static struct cpumask has_work; | ||
720 | int cpu; | ||
721 | |||
722 | mutex_lock(&lock); | ||
723 | get_online_cpus(); | ||
724 | cpumask_clear(&has_work); | ||
725 | |||
726 | for_each_online_cpu(cpu) { | ||
727 | struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); | ||
728 | |||
729 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | ||
730 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | ||
731 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | ||
732 | need_activate_page_drain(cpu)) { | ||
733 | INIT_WORK(work, lru_add_drain_per_cpu); | ||
734 | schedule_work_on(cpu, work); | ||
735 | cpumask_set_cpu(cpu, &has_work); | ||
736 | } | ||
737 | } | ||
738 | |||
739 | for_each_cpu(cpu, &has_work) | ||
740 | flush_work(&per_cpu(lru_add_drain_work, cpu)); | ||
741 | |||
742 | put_online_cpus(); | ||
743 | mutex_unlock(&lock); | ||
710 | } | 744 | } |
711 | 745 | ||
712 | /* | 746 | /* |
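The swap.c hunk replaces schedule_on_each_cpu() in lru_add_drain_all() with a selective version: work is queued only on CPUs whose pagevecs are non-empty, and only those work items are flushed. The following is a sequential userspace model of that two-pass pattern; pending[], drain_cpu() and NR_CPUS_DEMO are invented for the example, whereas the real code uses per-cpu pagevecs, schedule_work_on() and flush_work().

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_CPUS_DEMO 4

    static int pending[NR_CPUS_DEMO] = { 3, 0, 0, 7 };  /* queued pages per CPU */

    static void drain_cpu(int cpu)
    {
        printf("draining cpu %d (%d pages)\n", cpu, pending[cpu]);
        pending[cpu] = 0;
    }

    static void drain_all_selective(void)
    {
        bool has_work[NR_CPUS_DEMO] = { false };
        int cpu;

        /* pass 1: "schedule" work only where there is something to drain */
        for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
            if (pending[cpu])
                has_work[cpu] = true;

        /* pass 2: "flush" only the CPUs that were scheduled */
        for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
            if (has_work[cpu])
                drain_cpu(cpu);
    }

    int main(void)
    {
        drain_all_selective();  /* touches CPUs 0 and 3 only */
        return 0;
    }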
diff --git a/mm/truncate.c b/mm/truncate.c index e2e8a8a7eb9d..353b683afd6e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
567 | /** | 567 | /** |
568 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 568 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
569 | * @inode: inode | 569 | * @inode: inode |
570 | * @oldsize: old file size | ||
571 | * @newsize: new file size | 570 | * @newsize: new file size |
572 | * | 571 | * |
573 | * inode's new i_size must already be written before truncate_pagecache | 572 | * inode's new i_size must already be written before truncate_pagecache |
@@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
580 | * situations such as writepage being called for a page that has already | 579 | * situations such as writepage being called for a page that has already |
581 | * had its underlying blocks deallocated. | 580 | * had its underlying blocks deallocated. |
582 | */ | 581 | */ |
583 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) | 582 | void truncate_pagecache(struct inode *inode, loff_t newsize) |
584 | { | 583 | { |
585 | struct address_space *mapping = inode->i_mapping; | 584 | struct address_space *mapping = inode->i_mapping; |
586 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | 585 | loff_t holebegin = round_up(newsize, PAGE_SIZE); |
@@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
614 | */ | 613 | */ |
615 | void truncate_setsize(struct inode *inode, loff_t newsize) | 614 | void truncate_setsize(struct inode *inode, loff_t newsize) |
616 | { | 615 | { |
617 | loff_t oldsize; | ||
618 | |||
619 | oldsize = inode->i_size; | ||
620 | i_size_write(inode, newsize); | 616 | i_size_write(inode, newsize); |
621 | 617 | truncate_pagecache(inode, newsize); | |
622 | truncate_pagecache(inode, oldsize, newsize); | ||
623 | } | 618 | } |
624 | EXPORT_SYMBOL(truncate_setsize); | 619 | EXPORT_SYMBOL(truncate_setsize); |
625 | 620 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..8ed1b775bdc9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc) | |||
139 | { | 139 | { |
140 | return !sc->target_mem_cgroup; | 140 | return !sc->target_mem_cgroup; |
141 | } | 141 | } |
142 | |||
143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
144 | { | ||
145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
146 | return !mem_cgroup_disabled() && | ||
147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
148 | } | ||
142 | #else | 149 | #else |
143 | static bool global_reclaim(struct scan_control *sc) | 150 | static bool global_reclaim(struct scan_control *sc) |
144 | { | 151 | { |
145 | return true; | 152 | return true; |
146 | } | 153 | } |
154 | |||
155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
156 | { | ||
157 | return false; | ||
158 | } | ||
147 | #endif | 159 | #endif |
148 | 160 | ||
149 | unsigned long zone_reclaimable_pages(struct zone *zone) | 161 | unsigned long zone_reclaimable_pages(struct zone *zone) |
@@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2164 | } | 2176 | } |
2165 | } | 2177 | } |
2166 | 2178 | ||
2167 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 2179 | static int |
2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
2168 | { | 2181 | { |
2169 | unsigned long nr_reclaimed, nr_scanned; | 2182 | unsigned long nr_reclaimed, nr_scanned; |
2183 | int groups_scanned = 0; | ||
2170 | 2184 | ||
2171 | do { | 2185 | do { |
2172 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2186 | struct mem_cgroup *root = sc->target_mem_cgroup; |
@@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2174 | .zone = zone, | 2188 | .zone = zone, |
2175 | .priority = sc->priority, | 2189 | .priority = sc->priority, |
2176 | }; | 2190 | }; |
2177 | struct mem_cgroup *memcg; | 2191 | struct mem_cgroup *memcg = NULL; |
2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
2178 | 2194 | ||
2179 | nr_reclaimed = sc->nr_reclaimed; | 2195 | nr_reclaimed = sc->nr_reclaimed; |
2180 | nr_scanned = sc->nr_scanned; | 2196 | nr_scanned = sc->nr_scanned; |
2181 | 2197 | ||
2182 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { |
2183 | do { | ||
2184 | struct lruvec *lruvec; | 2199 | struct lruvec *lruvec; |
2185 | 2200 | ||
2201 | groups_scanned++; | ||
2186 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2187 | 2203 | ||
2188 | shrink_lruvec(lruvec, sc); | 2204 | shrink_lruvec(lruvec, sc); |
@@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2202 | mem_cgroup_iter_break(root, memcg); | 2218 | mem_cgroup_iter_break(root, memcg); |
2203 | break; | 2219 | break; |
2204 | } | 2220 | } |
2205 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2221 | } |
2206 | } while (memcg); | ||
2207 | 2222 | ||
2208 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2209 | sc->nr_scanned - nr_scanned, | 2224 | sc->nr_scanned - nr_scanned, |
@@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2211 | 2226 | ||
2212 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
2213 | sc->nr_scanned - nr_scanned, sc)); | 2228 | sc->nr_scanned - nr_scanned, sc)); |
2229 | |||
2230 | return groups_scanned; | ||
2231 | } | ||
2232 | |||
2233 | |||
2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
2235 | { | ||
2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
2238 | int scanned_groups; | ||
2239 | |||
2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
2241 | /* | ||
2242 | * The memcg iterator might race with another reclaimer or start from | ||
2243 | * an incomplete tree walk, so the walk in __shrink_zone | ||
2244 | * might have missed groups that are above the soft limit. Try | ||
2245 | * another loop to catch up with others. Do it just once to | ||
2246 | * avoid reclaim latencies when other reclaimers always | ||
2247 | * preempt this one. | ||
2248 | */ | ||
2249 | if (do_soft_reclaim && !scanned_groups) | ||
2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
2251 | |||
2252 | /* | ||
2253 | * No group is over the soft limit, or those that are have no | ||
2254 | * pages in the zone we are reclaiming, so we have to reclaim everybody | ||
2255 | */ | ||
2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
2257 | __shrink_zone(zone, sc, false); | ||
2258 | return; | ||
2259 | } | ||
2214 | } | 2260 | } |
2215 | 2261 | ||
2216 | /* Returns true if compaction should go ahead for a high-order request */ | 2262 | /* Returns true if compaction should go ahead for a high-order request */ |
@@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2274 | { | 2320 | { |
2275 | struct zoneref *z; | 2321 | struct zoneref *z; |
2276 | struct zone *zone; | 2322 | struct zone *zone; |
2277 | unsigned long nr_soft_reclaimed; | ||
2278 | unsigned long nr_soft_scanned; | ||
2279 | bool aborted_reclaim = false; | 2323 | bool aborted_reclaim = false; |
2280 | 2324 | ||
2281 | /* | 2325 | /* |
@@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2315 | continue; | 2359 | continue; |
2316 | } | 2360 | } |
2317 | } | 2361 | } |
2318 | /* | ||
2319 | * This steals pages from memory cgroups over softlimit | ||
2320 | * and returns the number of reclaimed pages and | ||
2321 | * scanned pages. This works for global memory pressure | ||
2322 | * and balancing, not for a memcg's limit. | ||
2323 | */ | ||
2324 | nr_soft_scanned = 0; | ||
2325 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2326 | sc->order, sc->gfp_mask, | ||
2327 | &nr_soft_scanned); | ||
2328 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2329 | sc->nr_scanned += nr_soft_scanned; | ||
2330 | /* need some check to avoid more shrink_zone() */ | 2362 | /* need some check to avoid more shrink_zone() */ |
2331 | } | 2363 | } |
2332 | 2364 | ||
@@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2920 | { | 2952 | { |
2921 | int i; | 2953 | int i; |
2922 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2923 | unsigned long nr_soft_reclaimed; | ||
2924 | unsigned long nr_soft_scanned; | ||
2925 | struct scan_control sc = { | 2955 | struct scan_control sc = { |
2926 | .gfp_mask = GFP_KERNEL, | 2956 | .gfp_mask = GFP_KERNEL, |
2927 | .priority = DEF_PRIORITY, | 2957 | .priority = DEF_PRIORITY, |
@@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3036 | 3066 | ||
3037 | sc.nr_scanned = 0; | 3067 | sc.nr_scanned = 0; |
3038 | 3068 | ||
3039 | nr_soft_scanned = 0; | ||
3040 | /* | ||
3041 | * Call soft limit reclaim before calling shrink_zone. | ||
3042 | */ | ||
3043 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
3044 | order, sc.gfp_mask, | ||
3045 | &nr_soft_scanned); | ||
3046 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3047 | |||
3048 | /* | 3069 | /* |
3049 | * There should be no need to raise the scanning | 3070 | * There should be no need to raise the scanning |
3050 | * priority if enough pages are already being scanned | 3071 | * priority if enough pages are already being scanned |
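The vmscan.c hunks fold soft limit reclaim into shrink_zone(): the memcg iterator is filtered by mem_cgroup_soft_reclaim_eligible() first, and if that pass scans nothing the zone is shrunk again without the filter. A compact sketch of that filtered-iteration-with-fallback pattern, using invented demo types and helpers rather than the real iterator, might look like:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct demo_group { const char *name; bool over_soft_limit; };

    typedef bool (*iter_filter)(const struct demo_group *g);

    static bool over_soft(const struct demo_group *g)
    {
        return g->over_soft_limit;  /* models mem_cgroup_soft_reclaim_eligible() */
    }

    static long scan_group(const struct demo_group *g)
    {
        printf("scanning %s\n", g->name);
        return 1;  /* pretend one page was scanned */
    }

    static long shrink_demo(struct demo_group *groups, size_t n, iter_filter filter)
    {
        long scanned = 0;
        size_t i;

        for (i = 0; i < n; i++)
            if (!filter || filter(&groups[i]))
                scanned += scan_group(&groups[i]);
        return scanned;
    }

    int main(void)
    {
        struct demo_group groups[] = {
            { "A", false }, { "B", false }, { "C", false },
        };
        size_t n = sizeof(groups) / sizeof(groups[0]);

        /* soft-limited pass first; nothing is over its limit here ... */
        if (shrink_demo(groups, n, over_soft) == 0)
            shrink_demo(groups, n, NULL);  /* ... so reclaim from everybody */
        return 0;
    }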
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 8a57d79b0b16..559d4ae6ebf4 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
87 | if (!cg_proto) | 87 | if (!cg_proto) |
88 | return -EINVAL; | 88 | return -EINVAL; |
89 | 89 | ||
90 | if (val > RESOURCE_MAX) | 90 | if (val > RES_COUNTER_MAX) |
91 | val = RESOURCE_MAX; | 91 | val = RES_COUNTER_MAX; |
92 | 92 | ||
93 | tcp = tcp_from_cgproto(cg_proto); | 93 | tcp = tcp_from_cgproto(cg_proto); |
94 | 94 | ||
@@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
101 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, | 101 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, |
102 | net->ipv4.sysctl_tcp_mem[i]); | 102 | net->ipv4.sysctl_tcp_mem[i]); |
103 | 103 | ||
104 | if (val == RESOURCE_MAX) | 104 | if (val == RES_COUNTER_MAX) |
105 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | 105 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); |
106 | else if (val != RESOURCE_MAX) { | 106 | else if (val != RES_COUNTER_MAX) { |
107 | /* | 107 | /* |
108 | * The active bit needs to be written after the static_key | 108 | * The active bit needs to be written after the static_key |
109 | * update. This is what guarantees that the socket activation | 109 | * update. This is what guarantees that the socket activation |
@@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) | |||
187 | 187 | ||
188 | switch (cft->private) { | 188 | switch (cft->private) { |
189 | case RES_LIMIT: | 189 | case RES_LIMIT: |
190 | val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); | 190 | val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); |
191 | break; | 191 | break; |
192 | case RES_USAGE: | 192 | case RES_USAGE: |
193 | val = tcp_read_usage(memcg); | 193 | val = tcp_read_usage(memcg); |