diff options
79 files changed, 973 insertions, 919 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 2a3330696372..8af4ad121828 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
| @@ -490,6 +490,8 @@ pgpgin - # of charging events to the memory cgroup. The charging | |||
| 490 | pgpgout - # of uncharging events to the memory cgroup. The uncharging | 490 | pgpgout - # of uncharging events to the memory cgroup. The uncharging |
| 491 | event happens each time a page is unaccounted from the cgroup. | 491 | event happens each time a page is unaccounted from the cgroup. |
| 492 | swap - # of bytes of swap usage | 492 | swap - # of bytes of swap usage |
| 493 | writeback - # of bytes of file/anon cache that are queued for syncing to | ||
| 494 | disk. | ||
| 493 | inactive_anon - # of bytes of anonymous and swap cache memory on inactive | 495 | inactive_anon - # of bytes of anonymous and swap cache memory on inactive |
| 494 | LRU list. | 496 | LRU list. |
| 495 | active_anon - # of bytes of anonymous and swap cache memory on active | 497 | active_anon - # of bytes of anonymous and swap cache memory on active |
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 0c4132dd3507..98838a05ba6d 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c | |||
| @@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, | |||
| 89 | const struct exception_table_entry *fixup; | 89 | const struct exception_table_entry *fixup; |
| 90 | int fault, si_code = SEGV_MAPERR; | 90 | int fault, si_code = SEGV_MAPERR; |
| 91 | siginfo_t info; | 91 | siginfo_t info; |
| 92 | unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 92 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 93 | (cause > 0 ? FAULT_FLAG_WRITE : 0)); | ||
| 94 | 93 | ||
| 95 | /* As of EV6, a load into $31/$f31 is a prefetch, and never faults | 94 | /* As of EV6, a load into $31/$f31 is a prefetch, and never faults |
| 96 | (or is suppressed by the PALcode). Support that for older CPUs | 95 | (or is suppressed by the PALcode). Support that for older CPUs |
| @@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr, | |||
| 115 | if (address >= TASK_SIZE) | 114 | if (address >= TASK_SIZE) |
| 116 | goto vmalloc_fault; | 115 | goto vmalloc_fault; |
| 117 | #endif | 116 | #endif |
| 118 | 117 | if (user_mode(regs)) | |
| 118 | flags |= FAULT_FLAG_USER; | ||
| 119 | retry: | 119 | retry: |
| 120 | down_read(&mm->mmap_sem); | 120 | down_read(&mm->mmap_sem); |
| 121 | vma = find_vma(mm, address); | 121 | vma = find_vma(mm, address); |
| @@ -142,6 +142,7 @@ retry: | |||
| 142 | } else { | 142 | } else { |
| 143 | if (!(vma->vm_flags & VM_WRITE)) | 143 | if (!(vma->vm_flags & VM_WRITE)) |
| 144 | goto bad_area; | 144 | goto bad_area; |
| 145 | flags |= FAULT_FLAG_WRITE; | ||
| 145 | } | 146 | } |
| 146 | 147 | ||
| 147 | /* If for any reason at all we couldn't handle the fault, | 148 | /* If for any reason at all we couldn't handle the fault, |
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 0fd1f0d515ff..d63f3de0cd5b 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c | |||
| @@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address) | |||
| 60 | siginfo_t info; | 60 | siginfo_t info; |
| 61 | int fault, ret; | 61 | int fault, ret; |
| 62 | int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ | 62 | int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ |
| 63 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 63 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 64 | (write ? FAULT_FLAG_WRITE : 0); | ||
| 65 | 64 | ||
| 66 | /* | 65 | /* |
| 67 | * We fault-in kernel-space virtual memory on-demand. The | 66 | * We fault-in kernel-space virtual memory on-demand. The |
| @@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address) | |||
| 89 | if (in_atomic() || !mm) | 88 | if (in_atomic() || !mm) |
| 90 | goto no_context; | 89 | goto no_context; |
| 91 | 90 | ||
| 91 | if (user_mode(regs)) | ||
| 92 | flags |= FAULT_FLAG_USER; | ||
| 92 | retry: | 93 | retry: |
| 93 | down_read(&mm->mmap_sem); | 94 | down_read(&mm->mmap_sem); |
| 94 | vma = find_vma(mm, address); | 95 | vma = find_vma(mm, address); |
| @@ -117,12 +118,12 @@ good_area: | |||
| 117 | if (write) { | 118 | if (write) { |
| 118 | if (!(vma->vm_flags & VM_WRITE)) | 119 | if (!(vma->vm_flags & VM_WRITE)) |
| 119 | goto bad_area; | 120 | goto bad_area; |
| 121 | flags |= FAULT_FLAG_WRITE; | ||
| 120 | } else { | 122 | } else { |
| 121 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | 123 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) |
| 122 | goto bad_area; | 124 | goto bad_area; |
| 123 | } | 125 | } |
| 124 | 126 | ||
| 125 | survive: | ||
| 126 | /* | 127 | /* |
| 127 | * If for any reason at all we couldn't handle the fault, | 128 | * If for any reason at all we couldn't handle the fault, |
| 128 | * make sure we exit gracefully rather than endlessly redo | 129 | * make sure we exit gracefully rather than endlessly redo |
| @@ -201,10 +202,6 @@ no_context: | |||
| 201 | die("Oops", regs, address); | 202 | die("Oops", regs, address); |
| 202 | 203 | ||
| 203 | out_of_memory: | 204 | out_of_memory: |
| 204 | if (is_global_init(tsk)) { | ||
| 205 | yield(); | ||
| 206 | goto survive; | ||
| 207 | } | ||
| 208 | up_read(&mm->mmap_sem); | 205 | up_read(&mm->mmap_sem); |
| 209 | 206 | ||
| 210 | if (user_mode(regs)) { | 207 | if (user_mode(regs)) { |
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c97f7940cb95..eb8830a4c5ed 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c | |||
| @@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
| 261 | struct task_struct *tsk; | 261 | struct task_struct *tsk; |
| 262 | struct mm_struct *mm; | 262 | struct mm_struct *mm; |
| 263 | int fault, sig, code; | 263 | int fault, sig, code; |
| 264 | int write = fsr & FSR_WRITE; | 264 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 265 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | ||
| 266 | (write ? FAULT_FLAG_WRITE : 0); | ||
| 267 | 265 | ||
| 268 | if (notify_page_fault(regs, fsr)) | 266 | if (notify_page_fault(regs, fsr)) |
| 269 | return 0; | 267 | return 0; |
| @@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
| 282 | if (in_atomic() || !mm) | 280 | if (in_atomic() || !mm) |
| 283 | goto no_context; | 281 | goto no_context; |
| 284 | 282 | ||
| 283 | if (user_mode(regs)) | ||
| 284 | flags |= FAULT_FLAG_USER; | ||
| 285 | if (fsr & FSR_WRITE) | ||
| 286 | flags |= FAULT_FLAG_WRITE; | ||
| 287 | |||
| 285 | /* | 288 | /* |
| 286 | * As per x86, we may deadlock here. However, since the kernel only | 289 | * As per x86, we may deadlock here. However, since the kernel only |
| 287 | * validly references user space from well defined areas of the code, | 290 | * validly references user space from well defined areas of the code, |
| @@ -349,6 +352,13 @@ retry: | |||
| 349 | if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) | 352 | if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) |
| 350 | return 0; | 353 | return 0; |
| 351 | 354 | ||
| 355 | /* | ||
| 356 | * If we are in kernel mode at this point, we | ||
| 357 | * have no context to handle this fault with. | ||
| 358 | */ | ||
| 359 | if (!user_mode(regs)) | ||
| 360 | goto no_context; | ||
| 361 | |||
| 352 | if (fault & VM_FAULT_OOM) { | 362 | if (fault & VM_FAULT_OOM) { |
| 353 | /* | 363 | /* |
| 354 | * We ran out of memory, call the OOM killer, and return to | 364 | * We ran out of memory, call the OOM killer, and return to |
| @@ -359,13 +369,6 @@ retry: | |||
| 359 | return 0; | 369 | return 0; |
| 360 | } | 370 | } |
| 361 | 371 | ||
| 362 | /* | ||
| 363 | * If we are in kernel mode at this point, we | ||
| 364 | * have no context to handle this fault with. | ||
| 365 | */ | ||
| 366 | if (!user_mode(regs)) | ||
| 367 | goto no_context; | ||
| 368 | |||
| 369 | if (fault & VM_FAULT_SIGBUS) { | 372 | if (fault & VM_FAULT_SIGBUS) { |
| 370 | /* | 373 | /* |
| 371 | * We had some memory, but were unable to | 374 | * We had some memory, but were unable to |
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6c8ba25bf6bb..6d6acf153bff 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c | |||
| @@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, | |||
| 199 | unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; | 199 | unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; |
| 200 | unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; | 200 | unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 201 | 201 | ||
| 202 | if (esr & ESR_LNX_EXEC) { | ||
| 203 | vm_flags = VM_EXEC; | ||
| 204 | } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { | ||
| 205 | vm_flags = VM_WRITE; | ||
| 206 | mm_flags |= FAULT_FLAG_WRITE; | ||
| 207 | } | ||
| 208 | |||
| 209 | tsk = current; | 202 | tsk = current; |
| 210 | mm = tsk->mm; | 203 | mm = tsk->mm; |
| 211 | 204 | ||
| @@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, | |||
| 220 | if (in_atomic() || !mm) | 213 | if (in_atomic() || !mm) |
| 221 | goto no_context; | 214 | goto no_context; |
| 222 | 215 | ||
| 216 | if (user_mode(regs)) | ||
| 217 | mm_flags |= FAULT_FLAG_USER; | ||
| 218 | |||
| 219 | if (esr & ESR_LNX_EXEC) { | ||
| 220 | vm_flags = VM_EXEC; | ||
| 221 | } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { | ||
| 222 | vm_flags = VM_WRITE; | ||
| 223 | mm_flags |= FAULT_FLAG_WRITE; | ||
| 224 | } | ||
| 225 | |||
| 223 | /* | 226 | /* |
| 224 | * As per x86, we may deadlock here. However, since the kernel only | 227 | * As per x86, we may deadlock here. However, since the kernel only |
| 225 | * validly references user space from well defined areas of the code, | 228 | * validly references user space from well defined areas of the code, |
| @@ -288,6 +291,13 @@ retry: | |||
| 288 | VM_FAULT_BADACCESS)))) | 291 | VM_FAULT_BADACCESS)))) |
| 289 | return 0; | 292 | return 0; |
| 290 | 293 | ||
| 294 | /* | ||
| 295 | * If we are in kernel mode at this point, we have no context to | ||
| 296 | * handle this fault with. | ||
| 297 | */ | ||
| 298 | if (!user_mode(regs)) | ||
| 299 | goto no_context; | ||
| 300 | |||
| 291 | if (fault & VM_FAULT_OOM) { | 301 | if (fault & VM_FAULT_OOM) { |
| 292 | /* | 302 | /* |
| 293 | * We ran out of memory, call the OOM killer, and return to | 303 | * We ran out of memory, call the OOM killer, and return to |
| @@ -298,13 +308,6 @@ retry: | |||
| 298 | return 0; | 308 | return 0; |
| 299 | } | 309 | } |
| 300 | 310 | ||
| 301 | /* | ||
| 302 | * If we are in kernel mode at this point, we have no context to | ||
| 303 | * handle this fault with. | ||
| 304 | */ | ||
| 305 | if (!user_mode(regs)) | ||
| 306 | goto no_context; | ||
| 307 | |||
| 308 | if (fault & VM_FAULT_SIGBUS) { | 311 | if (fault & VM_FAULT_SIGBUS) { |
| 309 | /* | 312 | /* |
| 310 | * We had some memory, but were unable to successfully fix up | 313 | * We had some memory, but were unable to successfully fix up |
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b2f2d2d66849..0eca93327195 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c | |||
| @@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) | |||
| 86 | 86 | ||
| 87 | local_irq_enable(); | 87 | local_irq_enable(); |
| 88 | 88 | ||
| 89 | if (user_mode(regs)) | ||
| 90 | flags |= FAULT_FLAG_USER; | ||
| 89 | retry: | 91 | retry: |
| 90 | down_read(&mm->mmap_sem); | 92 | down_read(&mm->mmap_sem); |
| 91 | 93 | ||
| @@ -228,9 +230,9 @@ no_context: | |||
| 228 | */ | 230 | */ |
| 229 | out_of_memory: | 231 | out_of_memory: |
| 230 | up_read(&mm->mmap_sem); | 232 | up_read(&mm->mmap_sem); |
| 231 | pagefault_out_of_memory(); | ||
| 232 | if (!user_mode(regs)) | 233 | if (!user_mode(regs)) |
| 233 | goto no_context; | 234 | goto no_context; |
| 235 | pagefault_out_of_memory(); | ||
| 234 | return; | 236 | return; |
| 235 | 237 | ||
| 236 | do_sigbus: | 238 | do_sigbus: |
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 73312ab6c696..1790f22e71a2 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c | |||
| @@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, | |||
| 58 | struct vm_area_struct * vma; | 58 | struct vm_area_struct * vma; |
| 59 | siginfo_t info; | 59 | siginfo_t info; |
| 60 | int fault; | 60 | int fault; |
| 61 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 61 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 62 | ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0); | ||
| 63 | 62 | ||
| 64 | D(printk(KERN_DEBUG | 63 | D(printk(KERN_DEBUG |
| 65 | "Page fault for %lX on %X at %lX, prot %d write %d\n", | 64 | "Page fault for %lX on %X at %lX, prot %d write %d\n", |
| @@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs, | |||
| 117 | if (in_atomic() || !mm) | 116 | if (in_atomic() || !mm) |
| 118 | goto no_context; | 117 | goto no_context; |
| 119 | 118 | ||
| 119 | if (user_mode(regs)) | ||
| 120 | flags |= FAULT_FLAG_USER; | ||
| 120 | retry: | 121 | retry: |
| 121 | down_read(&mm->mmap_sem); | 122 | down_read(&mm->mmap_sem); |
| 122 | vma = find_vma(mm, address); | 123 | vma = find_vma(mm, address); |
| @@ -155,6 +156,7 @@ retry: | |||
| 155 | } else if (writeaccess == 1) { | 156 | } else if (writeaccess == 1) { |
| 156 | if (!(vma->vm_flags & VM_WRITE)) | 157 | if (!(vma->vm_flags & VM_WRITE)) |
| 157 | goto bad_area; | 158 | goto bad_area; |
| 159 | flags |= FAULT_FLAG_WRITE; | ||
| 158 | } else { | 160 | } else { |
| 159 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | 161 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) |
| 160 | goto bad_area; | 162 | goto bad_area; |
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 331c1e2cfb67..9a66372fc7c7 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c | |||
| @@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
| 34 | struct vm_area_struct *vma; | 34 | struct vm_area_struct *vma; |
| 35 | struct mm_struct *mm; | 35 | struct mm_struct *mm; |
| 36 | unsigned long _pme, lrai, lrad, fixup; | 36 | unsigned long _pme, lrai, lrad, fixup; |
| 37 | unsigned long flags = 0; | ||
| 37 | siginfo_t info; | 38 | siginfo_t info; |
| 38 | pgd_t *pge; | 39 | pgd_t *pge; |
| 39 | pud_t *pue; | 40 | pud_t *pue; |
| 40 | pte_t *pte; | 41 | pte_t *pte; |
| 41 | int write; | ||
| 42 | int fault; | 42 | int fault; |
| 43 | 43 | ||
| 44 | #if 0 | 44 | #if 0 |
| @@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
| 81 | if (in_atomic() || !mm) | 81 | if (in_atomic() || !mm) |
| 82 | goto no_context; | 82 | goto no_context; |
| 83 | 83 | ||
| 84 | if (user_mode(__frame)) | ||
| 85 | flags |= FAULT_FLAG_USER; | ||
| 86 | |||
| 84 | down_read(&mm->mmap_sem); | 87 | down_read(&mm->mmap_sem); |
| 85 | 88 | ||
| 86 | vma = find_vma(mm, ear0); | 89 | vma = find_vma(mm, ear0); |
| @@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
| 129 | */ | 132 | */ |
| 130 | good_area: | 133 | good_area: |
| 131 | info.si_code = SEGV_ACCERR; | 134 | info.si_code = SEGV_ACCERR; |
| 132 | write = 0; | ||
| 133 | switch (esr0 & ESR0_ATXC) { | 135 | switch (esr0 & ESR0_ATXC) { |
| 134 | default: | 136 | default: |
| 135 | /* handle write to write protected page */ | 137 | /* handle write to write protected page */ |
| @@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
| 140 | #endif | 142 | #endif |
| 141 | if (!(vma->vm_flags & VM_WRITE)) | 143 | if (!(vma->vm_flags & VM_WRITE)) |
| 142 | goto bad_area; | 144 | goto bad_area; |
| 143 | write = 1; | 145 | flags |= FAULT_FLAG_WRITE; |
| 144 | break; | 146 | break; |
| 145 | 147 | ||
| 146 | /* handle read from protected page */ | 148 | /* handle read from protected page */ |
| @@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
| 162 | * make sure we exit gracefully rather than endlessly redo | 164 | * make sure we exit gracefully rather than endlessly redo |
| 163 | * the fault. | 165 | * the fault. |
| 164 | */ | 166 | */ |
| 165 | fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); | 167 | fault = handle_mm_fault(mm, vma, ear0, flags); |
| 166 | if (unlikely(fault & VM_FAULT_ERROR)) { | 168 | if (unlikely(fault & VM_FAULT_ERROR)) { |
| 167 | if (fault & VM_FAULT_OOM) | 169 | if (fault & VM_FAULT_OOM) |
| 168 | goto out_of_memory; | 170 | goto out_of_memory; |
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 1bd276dbec7d..8704c9320032 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c | |||
| @@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) | |||
| 53 | int si_code = SEGV_MAPERR; | 53 | int si_code = SEGV_MAPERR; |
| 54 | int fault; | 54 | int fault; |
| 55 | const struct exception_table_entry *fixup; | 55 | const struct exception_table_entry *fixup; |
| 56 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 56 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 57 | (cause > 0 ? FAULT_FLAG_WRITE : 0); | ||
| 58 | 57 | ||
| 59 | /* | 58 | /* |
| 60 | * If we're in an interrupt or have no user context, | 59 | * If we're in an interrupt or have no user context, |
| @@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) | |||
| 65 | 64 | ||
| 66 | local_irq_enable(); | 65 | local_irq_enable(); |
| 67 | 66 | ||
| 67 | if (user_mode(regs)) | ||
| 68 | flags |= FAULT_FLAG_USER; | ||
| 68 | retry: | 69 | retry: |
| 69 | down_read(&mm->mmap_sem); | 70 | down_read(&mm->mmap_sem); |
| 70 | vma = find_vma(mm, address); | 71 | vma = find_vma(mm, address); |
| @@ -96,6 +97,7 @@ good_area: | |||
| 96 | case FLT_STORE: | 97 | case FLT_STORE: |
| 97 | if (!(vma->vm_flags & VM_WRITE)) | 98 | if (!(vma->vm_flags & VM_WRITE)) |
| 98 | goto bad_area; | 99 | goto bad_area; |
| 100 | flags |= FAULT_FLAG_WRITE; | ||
| 99 | break; | 101 | break; |
| 100 | } | 102 | } |
| 101 | 103 | ||
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 6cf0341f978e..7225dad87094 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c | |||
| @@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re | |||
| 90 | mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) | 90 | mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) |
| 91 | | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); | 91 | | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); |
| 92 | 92 | ||
| 93 | flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); | ||
| 94 | |||
| 95 | /* mmap_sem is performance critical.... */ | 93 | /* mmap_sem is performance critical.... */ |
| 96 | prefetchw(&mm->mmap_sem); | 94 | prefetchw(&mm->mmap_sem); |
| 97 | 95 | ||
| @@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re | |||
| 119 | if (notify_page_fault(regs, TRAP_BRKPT)) | 117 | if (notify_page_fault(regs, TRAP_BRKPT)) |
| 120 | return; | 118 | return; |
| 121 | 119 | ||
| 120 | if (user_mode(regs)) | ||
| 121 | flags |= FAULT_FLAG_USER; | ||
| 122 | if (mask & VM_WRITE) | ||
| 123 | flags |= FAULT_FLAG_WRITE; | ||
| 122 | retry: | 124 | retry: |
| 123 | down_read(&mm->mmap_sem); | 125 | down_read(&mm->mmap_sem); |
| 124 | 126 | ||
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 3cdfa9c1d091..e9c6a8014bd6 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c | |||
| @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, | |||
| 78 | struct mm_struct *mm; | 78 | struct mm_struct *mm; |
| 79 | struct vm_area_struct * vma; | 79 | struct vm_area_struct * vma; |
| 80 | unsigned long page, addr; | 80 | unsigned long page, addr; |
| 81 | int write; | 81 | unsigned long flags = 0; |
| 82 | int fault; | 82 | int fault; |
| 83 | siginfo_t info; | 83 | siginfo_t info; |
| 84 | 84 | ||
| @@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, | |||
| 117 | if (in_atomic() || !mm) | 117 | if (in_atomic() || !mm) |
| 118 | goto bad_area_nosemaphore; | 118 | goto bad_area_nosemaphore; |
| 119 | 119 | ||
| 120 | if (error_code & ACE_USERMODE) | ||
| 121 | flags |= FAULT_FLAG_USER; | ||
| 122 | |||
| 120 | /* When running in the kernel we expect faults to occur only to | 123 | /* When running in the kernel we expect faults to occur only to |
| 121 | * addresses in user space. All other faults represent errors in the | 124 | * addresses in user space. All other faults represent errors in the |
| 122 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 125 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
| @@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, | |||
| 166 | */ | 169 | */ |
| 167 | good_area: | 170 | good_area: |
| 168 | info.si_code = SEGV_ACCERR; | 171 | info.si_code = SEGV_ACCERR; |
| 169 | write = 0; | ||
| 170 | switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { | 172 | switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { |
| 171 | default: /* 3: write, present */ | 173 | default: /* 3: write, present */ |
| 172 | /* fall through */ | 174 | /* fall through */ |
| 173 | case ACE_WRITE: /* write, not present */ | 175 | case ACE_WRITE: /* write, not present */ |
| 174 | if (!(vma->vm_flags & VM_WRITE)) | 176 | if (!(vma->vm_flags & VM_WRITE)) |
| 175 | goto bad_area; | 177 | goto bad_area; |
| 176 | write++; | 178 | flags |= FAULT_FLAG_WRITE; |
| 177 | break; | 179 | break; |
| 178 | case ACE_PROTECTION: /* read, present */ | 180 | case ACE_PROTECTION: /* read, present */ |
| 179 | case 0: /* read, not present */ | 181 | case 0: /* read, not present */ |
| @@ -194,7 +196,7 @@ good_area: | |||
| 194 | */ | 196 | */ |
| 195 | addr = (address & PAGE_MASK); | 197 | addr = (address & PAGE_MASK); |
| 196 | set_thread_fault_code(error_code); | 198 | set_thread_fault_code(error_code); |
| 197 | fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); | 199 | fault = handle_mm_fault(mm, vma, addr, flags); |
| 198 | if (unlikely(fault & VM_FAULT_ERROR)) { | 200 | if (unlikely(fault & VM_FAULT_ERROR)) { |
| 199 | if (fault & VM_FAULT_OOM) | 201 | if (fault & VM_FAULT_OOM) |
| 200 | goto out_of_memory; | 202 | goto out_of_memory; |
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index a563727806bf..eb1d61f68725 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c | |||
| @@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 88 | if (in_atomic() || !mm) | 88 | if (in_atomic() || !mm) |
| 89 | goto no_context; | 89 | goto no_context; |
| 90 | 90 | ||
| 91 | if (user_mode(regs)) | ||
| 92 | flags |= FAULT_FLAG_USER; | ||
| 91 | retry: | 93 | retry: |
| 92 | down_read(&mm->mmap_sem); | 94 | down_read(&mm->mmap_sem); |
| 93 | 95 | ||
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 8fddf46e6c62..332680e5ebf2 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c | |||
| @@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 53 | struct vm_area_struct *vma, *prev_vma; | 53 | struct vm_area_struct *vma, *prev_vma; |
| 54 | siginfo_t info; | 54 | siginfo_t info; |
| 55 | int fault; | 55 | int fault; |
| 56 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 56 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 57 | (write_access ? FAULT_FLAG_WRITE : 0); | ||
| 58 | 57 | ||
| 59 | tsk = current; | 58 | tsk = current; |
| 60 | 59 | ||
| @@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 109 | if (in_atomic() || !mm) | 108 | if (in_atomic() || !mm) |
| 110 | goto no_context; | 109 | goto no_context; |
| 111 | 110 | ||
| 111 | if (user_mode(regs)) | ||
| 112 | flags |= FAULT_FLAG_USER; | ||
| 112 | retry: | 113 | retry: |
| 113 | down_read(&mm->mmap_sem); | 114 | down_read(&mm->mmap_sem); |
| 114 | 115 | ||
| @@ -121,6 +122,7 @@ good_area: | |||
| 121 | if (write_access) { | 122 | if (write_access) { |
| 122 | if (!(vma->vm_flags & VM_WRITE)) | 123 | if (!(vma->vm_flags & VM_WRITE)) |
| 123 | goto bad_area; | 124 | goto bad_area; |
| 125 | flags |= FAULT_FLAG_WRITE; | ||
| 124 | } else { | 126 | } else { |
| 125 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | 127 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) |
| 126 | goto bad_area; | 128 | goto bad_area; |
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 731f739d17a1..fa4cf52aa7a6 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c | |||
| @@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 92 | int code = SEGV_MAPERR; | 92 | int code = SEGV_MAPERR; |
| 93 | int is_write = error_code & ESR_S; | 93 | int is_write = error_code & ESR_S; |
| 94 | int fault; | 94 | int fault; |
| 95 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 95 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 96 | (is_write ? FAULT_FLAG_WRITE : 0); | ||
| 97 | 96 | ||
| 98 | regs->ear = address; | 97 | regs->ear = address; |
| 99 | regs->esr = error_code; | 98 | regs->esr = error_code; |
| @@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 121 | die("Weird page fault", regs, SIGSEGV); | 120 | die("Weird page fault", regs, SIGSEGV); |
| 122 | } | 121 | } |
| 123 | 122 | ||
| 123 | if (user_mode(regs)) | ||
| 124 | flags |= FAULT_FLAG_USER; | ||
| 125 | |||
| 124 | /* When running in the kernel we expect faults to occur only to | 126 | /* When running in the kernel we expect faults to occur only to |
| 125 | * addresses in user space. All other faults represent errors in the | 127 | * addresses in user space. All other faults represent errors in the |
| 126 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 128 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
| @@ -199,6 +201,7 @@ good_area: | |||
| 199 | if (unlikely(is_write)) { | 201 | if (unlikely(is_write)) { |
| 200 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | 202 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
| 201 | goto bad_area; | 203 | goto bad_area; |
| 204 | flags |= FAULT_FLAG_WRITE; | ||
| 202 | /* a read */ | 205 | /* a read */ |
| 203 | } else { | 206 | } else { |
| 204 | /* protection fault */ | 207 | /* protection fault */ |
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 85df1cd8d446..becc42bb1849 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c | |||
| @@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 42 | const int field = sizeof(unsigned long) * 2; | 42 | const int field = sizeof(unsigned long) * 2; |
| 43 | siginfo_t info; | 43 | siginfo_t info; |
| 44 | int fault; | 44 | int fault; |
| 45 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 45 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 46 | (write ? FAULT_FLAG_WRITE : 0); | ||
| 47 | 46 | ||
| 48 | #if 0 | 47 | #if 0 |
| 49 | printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), | 48 | printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), |
| @@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 93 | if (in_atomic() || !mm) | 92 | if (in_atomic() || !mm) |
| 94 | goto bad_area_nosemaphore; | 93 | goto bad_area_nosemaphore; |
| 95 | 94 | ||
| 95 | if (user_mode(regs)) | ||
| 96 | flags |= FAULT_FLAG_USER; | ||
| 96 | retry: | 97 | retry: |
| 97 | down_read(&mm->mmap_sem); | 98 | down_read(&mm->mmap_sem); |
| 98 | vma = find_vma(mm, address); | 99 | vma = find_vma(mm, address); |
| @@ -114,6 +115,7 @@ good_area: | |||
| 114 | if (write) { | 115 | if (write) { |
| 115 | if (!(vma->vm_flags & VM_WRITE)) | 116 | if (!(vma->vm_flags & VM_WRITE)) |
| 116 | goto bad_area; | 117 | goto bad_area; |
| 118 | flags |= FAULT_FLAG_WRITE; | ||
| 117 | } else { | 119 | } else { |
| 118 | if (cpu_has_rixi) { | 120 | if (cpu_has_rixi) { |
| 119 | if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { | 121 | if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { |
| @@ -241,6 +243,8 @@ out_of_memory: | |||
| 241 | * (which will retry the fault, or kill us if we got oom-killed). | 243 | * (which will retry the fault, or kill us if we got oom-killed). |
| 242 | */ | 244 | */ |
| 243 | up_read(&mm->mmap_sem); | 245 | up_read(&mm->mmap_sem); |
| 246 | if (!user_mode(regs)) | ||
| 247 | goto no_context; | ||
| 244 | pagefault_out_of_memory(); | 248 | pagefault_out_of_memory(); |
| 245 | return; | 249 | return; |
| 246 | 250 | ||
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 8a2e6ded9a44..3516cbdf1ee9 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c | |||
| @@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, | |||
| 171 | if (in_atomic() || !mm) | 171 | if (in_atomic() || !mm) |
| 172 | goto no_context; | 172 | goto no_context; |
| 173 | 173 | ||
| 174 | if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) | ||
| 175 | flags |= FAULT_FLAG_USER; | ||
| 174 | retry: | 176 | retry: |
| 175 | down_read(&mm->mmap_sem); | 177 | down_read(&mm->mmap_sem); |
| 176 | 178 | ||
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 4a41f8493ab0..0703acf7d327 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c | |||
| @@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 86 | if (user_mode(regs)) { | 86 | if (user_mode(regs)) { |
| 87 | /* Exception was in userspace: reenable interrupts */ | 87 | /* Exception was in userspace: reenable interrupts */ |
| 88 | local_irq_enable(); | 88 | local_irq_enable(); |
| 89 | flags |= FAULT_FLAG_USER; | ||
| 89 | } else { | 90 | } else { |
| 90 | /* If exception was in a syscall, then IRQ's may have | 91 | /* If exception was in a syscall, then IRQ's may have |
| 91 | * been enabled or disabled. If they were enabled, | 92 | * been enabled or disabled. If they were enabled, |
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f247a3480e8e..d10d27a720c0 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c | |||
| @@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, | |||
| 180 | if (in_atomic() || !mm) | 180 | if (in_atomic() || !mm) |
| 181 | goto no_context; | 181 | goto no_context; |
| 182 | 182 | ||
| 183 | if (user_mode(regs)) | ||
| 184 | flags |= FAULT_FLAG_USER; | ||
| 185 | if (acc_type & VM_WRITE) | ||
| 186 | flags |= FAULT_FLAG_WRITE; | ||
| 183 | retry: | 187 | retry: |
| 184 | down_read(&mm->mmap_sem); | 188 | down_read(&mm->mmap_sem); |
| 185 | vma = find_vma_prev(mm, address, &prev_vma); | 189 | vma = find_vma_prev(mm, address, &prev_vma); |
| @@ -203,8 +207,7 @@ good_area: | |||
| 203 | * fault. | 207 | * fault. |
| 204 | */ | 208 | */ |
| 205 | 209 | ||
| 206 | fault = handle_mm_fault(mm, vma, address, | 210 | fault = handle_mm_fault(mm, vma, address, flags); |
| 207 | flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0)); | ||
| 208 | 211 | ||
| 209 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 212 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
| 210 | return; | 213 | return; |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 2dd69bf4af46..51ab9e7e6c39 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
| @@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 223 | is_write = error_code & ESR_DST; | 223 | is_write = error_code & ESR_DST; |
| 224 | #endif /* CONFIG_4xx || CONFIG_BOOKE */ | 224 | #endif /* CONFIG_4xx || CONFIG_BOOKE */ |
| 225 | 225 | ||
| 226 | if (is_write) | ||
| 227 | flags |= FAULT_FLAG_WRITE; | ||
| 228 | |||
| 229 | #ifdef CONFIG_PPC_ICSWX | 226 | #ifdef CONFIG_PPC_ICSWX |
| 230 | /* | 227 | /* |
| 231 | * we need to do this early because this "data storage | 228 | * we need to do this early because this "data storage |
| @@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 288 | if (user_mode(regs)) | 285 | if (user_mode(regs)) |
| 289 | store_update_sp = store_updates_sp(regs); | 286 | store_update_sp = store_updates_sp(regs); |
| 290 | 287 | ||
| 288 | if (user_mode(regs)) | ||
| 289 | flags |= FAULT_FLAG_USER; | ||
| 290 | |||
| 291 | /* When running in the kernel we expect faults to occur only to | 291 | /* When running in the kernel we expect faults to occur only to |
| 292 | * addresses in user space. All other faults represent errors in the | 292 | * addresses in user space. All other faults represent errors in the |
| 293 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 293 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
| @@ -415,6 +415,7 @@ good_area: | |||
| 415 | } else if (is_write) { | 415 | } else if (is_write) { |
| 416 | if (!(vma->vm_flags & VM_WRITE)) | 416 | if (!(vma->vm_flags & VM_WRITE)) |
| 417 | goto bad_area; | 417 | goto bad_area; |
| 418 | flags |= FAULT_FLAG_WRITE; | ||
| 418 | /* a read */ | 419 | /* a read */ |
| 419 | } else { | 420 | } else { |
| 420 | /* protection fault */ | 421 | /* protection fault */ |
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7de4469915f0..fc6679210d83 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c | |||
| @@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access) | |||
| 302 | address = trans_exc_code & __FAIL_ADDR_MASK; | 302 | address = trans_exc_code & __FAIL_ADDR_MASK; |
| 303 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); | 303 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); |
| 304 | flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; | 304 | flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 305 | if (user_mode(regs)) | ||
| 306 | flags |= FAULT_FLAG_USER; | ||
| 305 | if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) | 307 | if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) |
| 306 | flags |= FAULT_FLAG_WRITE; | 308 | flags |= FAULT_FLAG_WRITE; |
| 307 | down_read(&mm->mmap_sem); | 309 | down_read(&mm->mmap_sem); |
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 6b18fb0189ae..52238983527d 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c | |||
| @@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 47 | struct task_struct *tsk = current; | 47 | struct task_struct *tsk = current; |
| 48 | struct mm_struct *mm = tsk->mm; | 48 | struct mm_struct *mm = tsk->mm; |
| 49 | const int field = sizeof(unsigned long) * 2; | 49 | const int field = sizeof(unsigned long) * 2; |
| 50 | unsigned long flags = 0; | ||
| 50 | siginfo_t info; | 51 | siginfo_t info; |
| 51 | int fault; | 52 | int fault; |
| 52 | 53 | ||
| @@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 75 | if (in_atomic() || !mm) | 76 | if (in_atomic() || !mm) |
| 76 | goto bad_area_nosemaphore; | 77 | goto bad_area_nosemaphore; |
| 77 | 78 | ||
| 79 | if (user_mode(regs)) | ||
| 80 | flags |= FAULT_FLAG_USER; | ||
| 81 | |||
| 78 | down_read(&mm->mmap_sem); | 82 | down_read(&mm->mmap_sem); |
| 79 | vma = find_vma(mm, address); | 83 | vma = find_vma(mm, address); |
| 80 | if (!vma) | 84 | if (!vma) |
| @@ -95,18 +99,18 @@ good_area: | |||
| 95 | if (write) { | 99 | if (write) { |
| 96 | if (!(vma->vm_flags & VM_WRITE)) | 100 | if (!(vma->vm_flags & VM_WRITE)) |
| 97 | goto bad_area; | 101 | goto bad_area; |
| 102 | flags |= FAULT_FLAG_WRITE; | ||
| 98 | } else { | 103 | } else { |
| 99 | if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) | 104 | if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) |
| 100 | goto bad_area; | 105 | goto bad_area; |
| 101 | } | 106 | } |
| 102 | 107 | ||
| 103 | survive: | ||
| 104 | /* | 108 | /* |
| 105 | * If for any reason at all we couldn't handle the fault, | 109 | * If for any reason at all we couldn't handle the fault, |
| 106 | * make sure we exit gracefully rather than endlessly redo | 110 | * make sure we exit gracefully rather than endlessly redo |
| 107 | * the fault. | 111 | * the fault. |
| 108 | */ | 112 | */ |
| 109 | fault = handle_mm_fault(mm, vma, address, write); | 113 | fault = handle_mm_fault(mm, vma, address, flags); |
| 110 | if (unlikely(fault & VM_FAULT_ERROR)) { | 114 | if (unlikely(fault & VM_FAULT_ERROR)) { |
| 111 | if (fault & VM_FAULT_OOM) | 115 | if (fault & VM_FAULT_OOM) |
| 112 | goto out_of_memory; | 116 | goto out_of_memory; |
| @@ -167,11 +171,6 @@ no_context: | |||
| 167 | */ | 171 | */ |
| 168 | out_of_memory: | 172 | out_of_memory: |
| 169 | up_read(&mm->mmap_sem); | 173 | up_read(&mm->mmap_sem); |
| 170 | if (is_global_init(tsk)) { | ||
| 171 | yield(); | ||
| 172 | down_read(&mm->mmap_sem); | ||
| 173 | goto survive; | ||
| 174 | } | ||
| 175 | if (!user_mode(regs)) | 174 | if (!user_mode(regs)) |
| 176 | goto no_context; | 175 | goto no_context; |
| 177 | pagefault_out_of_memory(); | 176 | pagefault_out_of_memory(); |
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1f49c28affa9..541dc6101508 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c | |||
| @@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |||
| 400 | struct mm_struct *mm; | 400 | struct mm_struct *mm; |
| 401 | struct vm_area_struct * vma; | 401 | struct vm_area_struct * vma; |
| 402 | int fault; | 402 | int fault; |
| 403 | int write = error_code & FAULT_CODE_WRITE; | 403 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 404 | unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | ||
| 405 | (write ? FAULT_FLAG_WRITE : 0)); | ||
| 406 | 404 | ||
| 407 | tsk = current; | 405 | tsk = current; |
| 408 | mm = tsk->mm; | 406 | mm = tsk->mm; |
| @@ -476,6 +474,11 @@ good_area: | |||
| 476 | 474 | ||
| 477 | set_thread_fault_code(error_code); | 475 | set_thread_fault_code(error_code); |
| 478 | 476 | ||
| 477 | if (user_mode(regs)) | ||
| 478 | flags |= FAULT_FLAG_USER; | ||
| 479 | if (error_code & FAULT_CODE_WRITE) | ||
| 480 | flags |= FAULT_FLAG_WRITE; | ||
| 481 | |||
| 479 | /* | 482 | /* |
| 480 | * If for any reason at all we couldn't handle the fault, | 483 | * If for any reason at all we couldn't handle the fault, |
| 481 | * make sure we exit gracefully rather than endlessly redo | 484 | * make sure we exit gracefully rather than endlessly redo |
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e98bfda205a2..59dbd4645725 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c | |||
| @@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, | |||
| 177 | unsigned long g2; | 177 | unsigned long g2; |
| 178 | int from_user = !(regs->psr & PSR_PS); | 178 | int from_user = !(regs->psr & PSR_PS); |
| 179 | int fault, code; | 179 | int fault, code; |
| 180 | unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 180 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 181 | (write ? FAULT_FLAG_WRITE : 0)); | ||
| 182 | 181 | ||
| 183 | if (text_fault) | 182 | if (text_fault) |
| 184 | address = regs->pc; | 183 | address = regs->pc; |
| @@ -235,6 +234,11 @@ good_area: | |||
| 235 | goto bad_area; | 234 | goto bad_area; |
| 236 | } | 235 | } |
| 237 | 236 | ||
| 237 | if (from_user) | ||
| 238 | flags |= FAULT_FLAG_USER; | ||
| 239 | if (write) | ||
| 240 | flags |= FAULT_FLAG_WRITE; | ||
| 241 | |||
| 238 | /* | 242 | /* |
| 239 | * If for any reason at all we couldn't handle the fault, | 243 | * If for any reason at all we couldn't handle the fault, |
| 240 | * make sure we exit gracefully rather than endlessly redo | 244 | * make sure we exit gracefully rather than endlessly redo |
| @@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write) | |||
| 383 | struct vm_area_struct *vma; | 387 | struct vm_area_struct *vma; |
| 384 | struct task_struct *tsk = current; | 388 | struct task_struct *tsk = current; |
| 385 | struct mm_struct *mm = tsk->mm; | 389 | struct mm_struct *mm = tsk->mm; |
| 390 | unsigned int flags = FAULT_FLAG_USER; | ||
| 386 | int code; | 391 | int code; |
| 387 | 392 | ||
| 388 | code = SEGV_MAPERR; | 393 | code = SEGV_MAPERR; |
| @@ -402,11 +407,12 @@ good_area: | |||
| 402 | if (write) { | 407 | if (write) { |
| 403 | if (!(vma->vm_flags & VM_WRITE)) | 408 | if (!(vma->vm_flags & VM_WRITE)) |
| 404 | goto bad_area; | 409 | goto bad_area; |
| 410 | flags |= FAULT_FLAG_WRITE; | ||
| 405 | } else { | 411 | } else { |
| 406 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | 412 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) |
| 407 | goto bad_area; | 413 | goto bad_area; |
| 408 | } | 414 | } |
| 409 | switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { | 415 | switch (handle_mm_fault(mm, vma, address, flags)) { |
| 410 | case VM_FAULT_SIGBUS: | 416 | case VM_FAULT_SIGBUS: |
| 411 | case VM_FAULT_OOM: | 417 | case VM_FAULT_OOM: |
| 412 | goto do_sigbus; | 418 | goto do_sigbus; |
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 5062ff389e83..2ebec263d685 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c | |||
| @@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) | |||
| 315 | bad_kernel_pc(regs, address); | 315 | bad_kernel_pc(regs, address); |
| 316 | return; | 316 | return; |
| 317 | } | 317 | } |
| 318 | } | 318 | } else |
| 319 | flags |= FAULT_FLAG_USER; | ||
| 319 | 320 | ||
| 320 | /* | 321 | /* |
| 321 | * If we're in an interrupt or have no user | 322 | * If we're in an interrupt or have no user |
| @@ -418,13 +419,14 @@ good_area: | |||
| 418 | vma->vm_file != NULL) | 419 | vma->vm_file != NULL) |
| 419 | set_thread_fault_code(fault_code | | 420 | set_thread_fault_code(fault_code | |
| 420 | FAULT_CODE_BLKCOMMIT); | 421 | FAULT_CODE_BLKCOMMIT); |
| 422 | |||
| 423 | flags |= FAULT_FLAG_WRITE; | ||
| 421 | } else { | 424 | } else { |
| 422 | /* Allow reads even for write-only mappings */ | 425 | /* Allow reads even for write-only mappings */ |
| 423 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | 426 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) |
| 424 | goto bad_area; | 427 | goto bad_area; |
| 425 | } | 428 | } |
| 426 | 429 | ||
| 427 | flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); | ||
| 428 | fault = handle_mm_fault(mm, vma, address, flags); | 430 | fault = handle_mm_fault(mm, vma, address, flags); |
| 429 | 431 | ||
| 430 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 432 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 111d5a9b76f1..4c288f199453 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c | |||
| @@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs, | |||
| 280 | if (!is_page_fault) | 280 | if (!is_page_fault) |
| 281 | write = 1; | 281 | write = 1; |
| 282 | 282 | ||
| 283 | flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 283 | flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 284 | (write ? FAULT_FLAG_WRITE : 0)); | ||
| 285 | 284 | ||
| 286 | is_kernel_mode = !user_mode(regs); | 285 | is_kernel_mode = !user_mode(regs); |
| 287 | 286 | ||
| @@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs, | |||
| 365 | goto bad_area_nosemaphore; | 364 | goto bad_area_nosemaphore; |
| 366 | } | 365 | } |
| 367 | 366 | ||
| 367 | if (!is_kernel_mode) | ||
| 368 | flags |= FAULT_FLAG_USER; | ||
| 369 | |||
| 368 | /* | 370 | /* |
| 369 | * When running in the kernel we expect faults to occur only to | 371 | * When running in the kernel we expect faults to occur only to |
| 370 | * addresses in user space. All other faults represent errors in the | 372 | * addresses in user space. All other faults represent errors in the |
| @@ -425,12 +427,12 @@ good_area: | |||
| 425 | #endif | 427 | #endif |
| 426 | if (!(vma->vm_flags & VM_WRITE)) | 428 | if (!(vma->vm_flags & VM_WRITE)) |
| 427 | goto bad_area; | 429 | goto bad_area; |
| 430 | flags |= FAULT_FLAG_WRITE; | ||
| 428 | } else { | 431 | } else { |
| 429 | if (!is_page_fault || !(vma->vm_flags & VM_READ)) | 432 | if (!is_page_fault || !(vma->vm_flags & VM_READ)) |
| 430 | goto bad_area; | 433 | goto bad_area; |
| 431 | } | 434 | } |
| 432 | 435 | ||
| 433 | survive: | ||
| 434 | /* | 436 | /* |
| 435 | * If for any reason at all we couldn't handle the fault, | 437 | * If for any reason at all we couldn't handle the fault, |
| 436 | * make sure we exit gracefully rather than endlessly redo | 438 | * make sure we exit gracefully rather than endlessly redo |
| @@ -555,11 +557,6 @@ no_context: | |||
| 555 | */ | 557 | */ |
| 556 | out_of_memory: | 558 | out_of_memory: |
| 557 | up_read(&mm->mmap_sem); | 559 | up_read(&mm->mmap_sem); |
| 558 | if (is_global_init(tsk)) { | ||
| 559 | yield(); | ||
| 560 | down_read(&mm->mmap_sem); | ||
| 561 | goto survive; | ||
| 562 | } | ||
| 563 | if (is_kernel_mode) | 560 | if (is_kernel_mode) |
| 564 | goto no_context; | 561 | goto no_context; |
| 565 | pagefault_out_of_memory(); | 562 | pagefault_out_of_memory(); |
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f3987e273..5c3aef74237f 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c | |||
| @@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, | |||
| 30 | pmd_t *pmd; | 30 | pmd_t *pmd; |
| 31 | pte_t *pte; | 31 | pte_t *pte; |
| 32 | int err = -EFAULT; | 32 | int err = -EFAULT; |
| 33 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 33 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 34 | (is_write ? FAULT_FLAG_WRITE : 0); | ||
| 35 | 34 | ||
| 36 | *code_out = SEGV_MAPERR; | 35 | *code_out = SEGV_MAPERR; |
| 37 | 36 | ||
| @@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip, | |||
| 42 | if (in_atomic()) | 41 | if (in_atomic()) |
| 43 | goto out_nosemaphore; | 42 | goto out_nosemaphore; |
| 44 | 43 | ||
| 44 | if (is_user) | ||
| 45 | flags |= FAULT_FLAG_USER; | ||
| 45 | retry: | 46 | retry: |
| 46 | down_read(&mm->mmap_sem); | 47 | down_read(&mm->mmap_sem); |
| 47 | vma = find_vma(mm, address); | 48 | vma = find_vma(mm, address); |
| @@ -58,12 +59,15 @@ retry: | |||
| 58 | 59 | ||
| 59 | good_area: | 60 | good_area: |
| 60 | *code_out = SEGV_ACCERR; | 61 | *code_out = SEGV_ACCERR; |
| 61 | if (is_write && !(vma->vm_flags & VM_WRITE)) | 62 | if (is_write) { |
| 62 | goto out; | 63 | if (!(vma->vm_flags & VM_WRITE)) |
| 63 | 64 | goto out; | |
| 64 | /* Don't require VM_READ|VM_EXEC for write faults! */ | 65 | flags |= FAULT_FLAG_WRITE; |
| 65 | if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) | 66 | } else { |
| 66 | goto out; | 67 | /* Don't require VM_READ|VM_EXEC for write faults! */ |
| 68 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | ||
| 69 | goto out; | ||
| 70 | } | ||
| 67 | 71 | ||
| 68 | do { | 72 | do { |
| 69 | int fault; | 73 | int fault; |
| @@ -124,6 +128,8 @@ out_of_memory: | |||
| 124 | * (which will retry the fault, or kill us if we got oom-killed). | 128 | * (which will retry the fault, or kill us if we got oom-killed). |
| 125 | */ | 129 | */ |
| 126 | up_read(&mm->mmap_sem); | 130 | up_read(&mm->mmap_sem); |
| 131 | if (!is_user) | ||
| 132 | goto out_nosemaphore; | ||
| 127 | pagefault_out_of_memory(); | 133 | pagefault_out_of_memory(); |
| 128 | return 0; | 134 | return 0; |
| 129 | } | 135 | } |
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index f9b5c10bccee..0dc922dba915 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c | |||
| @@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
| 209 | struct task_struct *tsk; | 209 | struct task_struct *tsk; |
| 210 | struct mm_struct *mm; | 210 | struct mm_struct *mm; |
| 211 | int fault, sig, code; | 211 | int fault, sig, code; |
| 212 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | 212 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 213 | ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0); | ||
| 214 | 213 | ||
| 215 | tsk = current; | 214 | tsk = current; |
| 216 | mm = tsk->mm; | 215 | mm = tsk->mm; |
| @@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
| 222 | if (in_atomic() || !mm) | 221 | if (in_atomic() || !mm) |
| 223 | goto no_context; | 222 | goto no_context; |
| 224 | 223 | ||
| 224 | if (user_mode(regs)) | ||
| 225 | flags |= FAULT_FLAG_USER; | ||
| 226 | if (!(fsr ^ 0x12)) | ||
| 227 | flags |= FAULT_FLAG_WRITE; | ||
| 228 | |||
| 225 | /* | 229 | /* |
| 226 | * As per x86, we may deadlock here. However, since the kernel only | 230 | * As per x86, we may deadlock here. However, since the kernel only |
| 227 | * validly references user space from well defined areas of the code, | 231 | * validly references user space from well defined areas of the code, |
| @@ -278,6 +282,13 @@ retry: | |||
| 278 | (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) | 282 | (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) |
| 279 | return 0; | 283 | return 0; |
| 280 | 284 | ||
| 285 | /* | ||
| 286 | * If we are in kernel mode at this point, we | ||
| 287 | * have no context to handle this fault with. | ||
| 288 | */ | ||
| 289 | if (!user_mode(regs)) | ||
| 290 | goto no_context; | ||
| 291 | |||
| 281 | if (fault & VM_FAULT_OOM) { | 292 | if (fault & VM_FAULT_OOM) { |
| 282 | /* | 293 | /* |
| 283 | * We ran out of memory, call the OOM killer, and return to | 294 | * We ran out of memory, call the OOM killer, and return to |
| @@ -288,13 +299,6 @@ retry: | |||
| 288 | return 0; | 299 | return 0; |
| 289 | } | 300 | } |
| 290 | 301 | ||
| 291 | /* | ||
| 292 | * If we are in kernel mode at this point, we | ||
| 293 | * have no context to handle this fault with. | ||
| 294 | */ | ||
| 295 | if (!user_mode(regs)) | ||
| 296 | goto no_context; | ||
| 297 | |||
| 298 | if (fault & VM_FAULT_SIGBUS) { | 302 | if (fault & VM_FAULT_SIGBUS) { |
| 299 | /* | 303 | /* |
| 300 | * We had some memory, but were unable to | 304 | * We had some memory, but were unable to |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4ae3047..3aaeffcfd67a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
| @@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
| 842 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); | 842 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
| 843 | } | 843 | } |
| 844 | 844 | ||
| 845 | static noinline int | 845 | static noinline void |
| 846 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, | 846 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, |
| 847 | unsigned long address, unsigned int fault) | 847 | unsigned long address, unsigned int fault) |
| 848 | { | 848 | { |
| 849 | /* | 849 | if (fatal_signal_pending(current) && !(error_code & PF_USER)) { |
| 850 | * Pagefault was interrupted by SIGKILL. We have no reason to | 850 | up_read(&current->mm->mmap_sem); |
| 851 | * continue pagefault. | 851 | no_context(regs, error_code, address, 0, 0); |
| 852 | */ | 852 | return; |
| 853 | if (fatal_signal_pending(current)) { | ||
| 854 | if (!(fault & VM_FAULT_RETRY)) | ||
| 855 | up_read(&current->mm->mmap_sem); | ||
| 856 | if (!(error_code & PF_USER)) | ||
| 857 | no_context(regs, error_code, address, 0, 0); | ||
| 858 | return 1; | ||
| 859 | } | 853 | } |
| 860 | if (!(fault & VM_FAULT_ERROR)) | ||
| 861 | return 0; | ||
| 862 | 854 | ||
| 863 | if (fault & VM_FAULT_OOM) { | 855 | if (fault & VM_FAULT_OOM) { |
| 864 | /* Kernel mode? Handle exceptions or die: */ | 856 | /* Kernel mode? Handle exceptions or die: */ |
| @@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
| 866 | up_read(&current->mm->mmap_sem); | 858 | up_read(&current->mm->mmap_sem); |
| 867 | no_context(regs, error_code, address, | 859 | no_context(regs, error_code, address, |
| 868 | SIGSEGV, SEGV_MAPERR); | 860 | SIGSEGV, SEGV_MAPERR); |
| 869 | return 1; | 861 | return; |
| 870 | } | 862 | } |
| 871 | 863 | ||
| 872 | up_read(&current->mm->mmap_sem); | 864 | up_read(&current->mm->mmap_sem); |
| @@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
| 884 | else | 876 | else |
| 885 | BUG(); | 877 | BUG(); |
| 886 | } | 878 | } |
| 887 | return 1; | ||
| 888 | } | 879 | } |
| 889 | 880 | ||
| 890 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) | 881 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) |
| @@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
| 1011 | unsigned long address; | 1002 | unsigned long address; |
| 1012 | struct mm_struct *mm; | 1003 | struct mm_struct *mm; |
| 1013 | int fault; | 1004 | int fault; |
| 1014 | int write = error_code & PF_WRITE; | 1005 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
| 1015 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | ||
| 1016 | (write ? FAULT_FLAG_WRITE : 0); | ||
| 1017 | 1006 | ||
| 1018 | tsk = current; | 1007 | tsk = current; |
| 1019 | mm = tsk->mm; | 1008 | mm = tsk->mm; |
| @@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
| 1083 | if (user_mode_vm(regs)) { | 1072 | if (user_mode_vm(regs)) { |
| 1084 | local_irq_enable(); | 1073 | local_irq_enable(); |
| 1085 | error_code |= PF_USER; | 1074 | error_code |= PF_USER; |
| 1075 | flags |= FAULT_FLAG_USER; | ||
| 1086 | } else { | 1076 | } else { |
| 1087 | if (regs->flags & X86_EFLAGS_IF) | 1077 | if (regs->flags & X86_EFLAGS_IF) |
| 1088 | local_irq_enable(); | 1078 | local_irq_enable(); |
| @@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
| 1109 | return; | 1099 | return; |
| 1110 | } | 1100 | } |
| 1111 | 1101 | ||
| 1102 | if (error_code & PF_WRITE) | ||
| 1103 | flags |= FAULT_FLAG_WRITE; | ||
| 1104 | |||
| 1112 | /* | 1105 | /* |
| 1113 | * When running in the kernel we expect faults to occur only to | 1106 | * When running in the kernel we expect faults to occur only to |
| 1114 | * addresses in user space. All other faults represent errors in | 1107 | * addresses in user space. All other faults represent errors in |
| @@ -1187,9 +1180,17 @@ good_area: | |||
| 1187 | */ | 1180 | */ |
| 1188 | fault = handle_mm_fault(mm, vma, address, flags); | 1181 | fault = handle_mm_fault(mm, vma, address, flags); |
| 1189 | 1182 | ||
| 1190 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { | 1183 | /* |
| 1191 | if (mm_fault_error(regs, error_code, address, fault)) | 1184 | * If we need to retry but a fatal signal is pending, handle the |
| 1192 | return; | 1185 | * signal first. We do not need to release the mmap_sem because it |
| 1186 | * would already be released in __lock_page_or_retry in mm/filemap.c. | ||
| 1187 | */ | ||
| 1188 | if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) | ||
| 1189 | return; | ||
| 1190 | |||
| 1191 | if (unlikely(fault & VM_FAULT_ERROR)) { | ||
| 1192 | mm_fault_error(regs, error_code, address, fault); | ||
| 1193 | return; | ||
| 1193 | } | 1194 | } |
| 1194 | 1195 | ||
| 1195 | /* | 1196 | /* |
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 4b7bc8db170f..70fa7bc42b4a 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c | |||
| @@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs) | |||
| 72 | address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); | 72 | address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); |
| 73 | #endif | 73 | #endif |
| 74 | 74 | ||
| 75 | if (user_mode(regs)) | ||
| 76 | flags |= FAULT_FLAG_USER; | ||
| 75 | retry: | 77 | retry: |
| 76 | down_read(&mm->mmap_sem); | 78 | down_read(&mm->mmap_sem); |
| 77 | vma = find_vma(mm, address); | 79 | vma = find_vma(mm, address); |
diff --git a/drivers/base/node.c b/drivers/base/node.c index 7616a77ca322..bc9f43bf7e29 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
| @@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
| 125 | nid, K(node_page_state(nid, NR_WRITEBACK)), | 125 | nid, K(node_page_state(nid, NR_WRITEBACK)), |
| 126 | nid, K(node_page_state(nid, NR_FILE_PAGES)), | 126 | nid, K(node_page_state(nid, NR_FILE_PAGES)), |
| 127 | nid, K(node_page_state(nid, NR_FILE_MAPPED)), | 127 | nid, K(node_page_state(nid, NR_FILE_MAPPED)), |
| 128 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 129 | nid, K(node_page_state(nid, NR_ANON_PAGES) | ||
| 130 | + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * | ||
| 131 | HPAGE_PMD_NR), | ||
| 132 | #else | ||
| 133 | nid, K(node_page_state(nid, NR_ANON_PAGES)), | 128 | nid, K(node_page_state(nid, NR_ANON_PAGES)), |
| 134 | #endif | ||
| 135 | nid, K(node_page_state(nid, NR_SHMEM)), | 129 | nid, K(node_page_state(nid, NR_SHMEM)), |
| 136 | nid, node_page_state(nid, NR_KERNEL_STACK) * | 130 | nid, node_page_state(nid, NR_KERNEL_STACK) * |
| 137 | THREAD_SIZE / 1024, | 131 | THREAD_SIZE / 1024, |
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5f95d1ed9c6d..b9acadafa4a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c | |||
| @@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 50 | struct inode *inode = mapping->host; | 50 | struct inode *inode = mapping->host; |
| 51 | 51 | ||
| 52 | if (to > inode->i_size) | 52 | if (to > inode->i_size) |
| 53 | truncate_pagecache(inode, to, inode->i_size); | 53 | truncate_pagecache(inode, inode->i_size); |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | static int adfs_write_begin(struct file *file, struct address_space *mapping, | 56 | static int adfs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/affs/file.c b/fs/affs/file.c index 776e3935a758..8669b6ecddee 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c | |||
| @@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) | |||
| 406 | struct inode *inode = mapping->host; | 406 | struct inode *inode = mapping->host; |
| 407 | 407 | ||
| 408 | if (to > inode->i_size) { | 408 | if (to > inode->i_size) { |
| 409 | truncate_pagecache(inode, to, inode->i_size); | 409 | truncate_pagecache(inode, inode->i_size); |
| 410 | affs_truncate(inode); | 410 | affs_truncate(inode); |
| 411 | } | 411 | } |
| 412 | } | 412 | } |
diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ad3ea1497cc3..ae2892218335 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c | |||
| @@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 166 | struct inode *inode = mapping->host; | 166 | struct inode *inode = mapping->host; |
| 167 | 167 | ||
| 168 | if (to > inode->i_size) | 168 | if (to > inode->i_size) |
| 169 | truncate_pagecache(inode, to, inode->i_size); | 169 | truncate_pagecache(inode, inode->i_size); |
| 170 | } | 170 | } |
| 171 | 171 | ||
| 172 | static int bfs_write_begin(struct file *file, struct address_space *mapping, | 172 | static int bfs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef3bea7bb257..3f0ddfce96e6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
| @@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
| 221 | struct btrfs_path *path, | 221 | struct btrfs_path *path, |
| 222 | struct inode *inode) | 222 | struct inode *inode) |
| 223 | { | 223 | { |
| 224 | loff_t oldsize; | ||
| 225 | int ret = 0; | 224 | int ret = 0; |
| 226 | 225 | ||
| 227 | oldsize = i_size_read(inode); | ||
| 228 | btrfs_i_size_write(inode, 0); | 226 | btrfs_i_size_write(inode, 0); |
| 229 | truncate_pagecache(inode, oldsize, 0); | 227 | truncate_pagecache(inode, 0); |
| 230 | 228 | ||
| 231 | /* | 229 | /* |
| 232 | * We don't need an orphan item because truncating the free space cache | 230 | * We don't need an orphan item because truncating the free space cache |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db1e43948579..f338c5672d58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
| 4349 | inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); | 4349 | inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); |
| 4350 | 4350 | ||
| 4351 | if (newsize > oldsize) { | 4351 | if (newsize > oldsize) { |
| 4352 | truncate_pagecache(inode, oldsize, newsize); | 4352 | truncate_pagecache(inode, newsize); |
| 4353 | ret = btrfs_cont_expand(inode, oldsize, newsize); | 4353 | ret = btrfs_cont_expand(inode, oldsize, newsize); |
| 4354 | if (ret) | 4354 | if (ret) |
| 4355 | return ret; | 4355 | return ret; |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e3bb6477c83f..f9ff9c173f78 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
| @@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) | |||
| 1856 | 1856 | ||
| 1857 | static void cifs_setsize(struct inode *inode, loff_t offset) | 1857 | static void cifs_setsize(struct inode *inode, loff_t offset) |
| 1858 | { | 1858 | { |
| 1859 | loff_t oldsize; | ||
| 1860 | |||
| 1861 | spin_lock(&inode->i_lock); | 1859 | spin_lock(&inode->i_lock); |
| 1862 | oldsize = inode->i_size; | ||
| 1863 | i_size_write(inode, offset); | 1860 | i_size_write(inode, offset); |
| 1864 | spin_unlock(&inode->i_lock); | 1861 | spin_unlock(&inode->i_lock); |
| 1865 | 1862 | ||
| 1866 | truncate_pagecache(inode, oldsize, offset); | 1863 | truncate_pagecache(inode, offset); |
| 1867 | } | 1864 | } |
| 1868 | 1865 | ||
| 1869 | static int | 1866 | static int |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2ec8eb1ab269..a52a5d23c30b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
| @@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) | |||
| 861 | static void _write_failed(struct inode *inode, loff_t to) | 861 | static void _write_failed(struct inode *inode, loff_t to) |
| 862 | { | 862 | { |
| 863 | if (to > inode->i_size) | 863 | if (to > inode->i_size) |
| 864 | truncate_pagecache(inode, to, inode->i_size); | 864 | truncate_pagecache(inode, inode->i_size); |
| 865 | } | 865 | } |
| 866 | 866 | ||
| 867 | int exofs_write_begin(struct file *file, struct address_space *mapping, | 867 | int exofs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0a87bb10998d..c260de6d7b6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
| @@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to) | |||
| 58 | struct inode *inode = mapping->host; | 58 | struct inode *inode = mapping->host; |
| 59 | 59 | ||
| 60 | if (to > inode->i_size) { | 60 | if (to > inode->i_size) { |
| 61 | truncate_pagecache(inode, to, inode->i_size); | 61 | truncate_pagecache(inode, inode->i_size); |
| 62 | ext2_truncate_blocks(inode, inode->i_size); | 62 | ext2_truncate_blocks(inode, inode->i_size); |
| 63 | } | 63 | } |
| 64 | } | 64 | } |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c79fd7dabe79..0d424d7ac02b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 4587 | 4587 | ||
| 4588 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { | 4588 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { |
| 4589 | handle_t *handle; | 4589 | handle_t *handle; |
| 4590 | loff_t oldsize = inode->i_size; | ||
| 4591 | 4590 | ||
| 4592 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { | 4591 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
| 4593 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 4592 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| @@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 4650 | * Truncate pagecache after we've waited for commit | 4649 | * Truncate pagecache after we've waited for commit |
| 4651 | * in data=journal mode to make pages freeable. | 4650 | * in data=journal mode to make pages freeable. |
| 4652 | */ | 4651 | */ |
| 4653 | truncate_pagecache(inode, oldsize, inode->i_size); | 4652 | truncate_pagecache(inode, inode->i_size); |
| 4654 | } | 4653 | } |
| 4655 | /* | 4654 | /* |
| 4656 | * We want to call ext4_truncate() even if attr->ia_size == | 4655 | * We want to call ext4_truncate() even if attr->ia_size == |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 11b51bb55b42..0062da21dd8b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
| @@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) | |||
| 147 | struct inode *inode = mapping->host; | 147 | struct inode *inode = mapping->host; |
| 148 | 148 | ||
| 149 | if (to > inode->i_size) { | 149 | if (to > inode->i_size) { |
| 150 | truncate_pagecache(inode, to, inode->i_size); | 150 | truncate_pagecache(inode, inode->i_size); |
| 151 | fat_truncate_blocks(inode, inode->i_size); | 151 | fat_truncate_blocks(inode, inode->i_size); |
| 152 | } | 152 | } |
| 153 | } | 153 | } |
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3ac91086f41f..62b43b577bfc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
| @@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, | |||
| 1678 | * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. | 1678 | * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. |
| 1679 | */ | 1679 | */ |
| 1680 | if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { | 1680 | if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { |
| 1681 | truncate_pagecache(inode, oldsize, outarg.attr.size); | 1681 | truncate_pagecache(inode, outarg.attr.size); |
| 1682 | invalidate_inode_pages2(inode->i_mapping); | 1682 | invalidate_inode_pages2(inode->i_mapping); |
| 1683 | } | 1683 | } |
| 1684 | 1684 | ||
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84434594e80e..a8ce6dab60a0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
| @@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, | |||
| 218 | bool inval = false; | 218 | bool inval = false; |
| 219 | 219 | ||
| 220 | if (oldsize != attr->size) { | 220 | if (oldsize != attr->size) { |
| 221 | truncate_pagecache(inode, oldsize, attr->size); | 221 | truncate_pagecache(inode, attr->size); |
| 222 | inval = true; | 222 | inval = true; |
| 223 | } else if (fc->auto_inval_data) { | 223 | } else if (fc->auto_inval_data) { |
| 224 | struct timespec new_mtime = { | 224 | struct timespec new_mtime = { |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e2f56fccf6b..62a65fc448dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize | |||
| 1016 | chunk = oldsize - newsize; | 1016 | chunk = oldsize - newsize; |
| 1017 | if (chunk > max_chunk) | 1017 | if (chunk > max_chunk) |
| 1018 | chunk = max_chunk; | 1018 | chunk = max_chunk; |
| 1019 | truncate_pagecache(inode, oldsize, oldsize - chunk); | 1019 | truncate_pagecache(inode, oldsize - chunk); |
| 1020 | oldsize -= chunk; | 1020 | oldsize -= chunk; |
| 1021 | gfs2_trans_end(sdp); | 1021 | gfs2_trans_end(sdp); |
| 1022 | error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); | 1022 | error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); |
| @@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | |||
| 1067 | if (journaled) | 1067 | if (journaled) |
| 1068 | error = gfs2_journaled_truncate(inode, oldsize, newsize); | 1068 | error = gfs2_journaled_truncate(inode, oldsize, newsize); |
| 1069 | else | 1069 | else |
| 1070 | truncate_pagecache(inode, oldsize, newsize); | 1070 | truncate_pagecache(inode, newsize); |
| 1071 | 1071 | ||
| 1072 | if (error) { | 1072 | if (error) { |
| 1073 | brelse(dibh); | 1073 | brelse(dibh); |
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f9299d8a64e3..380ab31b5e0f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c | |||
| @@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 41 | struct inode *inode = mapping->host; | 41 | struct inode *inode = mapping->host; |
| 42 | 42 | ||
| 43 | if (to > inode->i_size) { | 43 | if (to > inode->i_size) { |
| 44 | truncate_pagecache(inode, to, inode->i_size); | 44 | truncate_pagecache(inode, inode->i_size); |
| 45 | hfs_file_truncate(inode); | 45 | hfs_file_truncate(inode); |
| 46 | } | 46 | } |
| 47 | } | 47 | } |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4d2edaea891c..37213d075f3c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
| @@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) | |||
| 36 | struct inode *inode = mapping->host; | 36 | struct inode *inode = mapping->host; |
| 37 | 37 | ||
| 38 | if (to > inode->i_size) { | 38 | if (to > inode->i_size) { |
| 39 | truncate_pagecache(inode, to, inode->i_size); | 39 | truncate_pagecache(inode, inode->i_size); |
| 40 | hfsplus_file_truncate(inode); | 40 | hfsplus_file_truncate(inode); |
| 41 | } | 41 | } |
| 42 | } | 42 | } |
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 4e9dabcf1f4c..67c1a61e0955 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c | |||
| @@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 138 | hpfs_lock(inode->i_sb); | 138 | hpfs_lock(inode->i_sb); |
| 139 | 139 | ||
| 140 | if (to > inode->i_size) { | 140 | if (to > inode->i_size) { |
| 141 | truncate_pagecache(inode, to, inode->i_size); | 141 | truncate_pagecache(inode, inode->i_size); |
| 142 | hpfs_truncate(inode); | 142 | hpfs_truncate(inode); |
| 143 | } | 143 | } |
| 144 | 144 | ||
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 730f24e282a6..f4aab719add5 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c | |||
| @@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 306 | struct inode *inode = mapping->host; | 306 | struct inode *inode = mapping->host; |
| 307 | 307 | ||
| 308 | if (to > inode->i_size) { | 308 | if (to > inode->i_size) { |
| 309 | truncate_pagecache(inode, to, inode->i_size); | 309 | truncate_pagecache(inode, inode->i_size); |
| 310 | jfs_truncate(inode); | 310 | jfs_truncate(inode); |
| 311 | } | 311 | } |
| 312 | } | 312 | } |
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df122496f328..0332109162a5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
| @@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) | |||
| 400 | struct inode *inode = mapping->host; | 400 | struct inode *inode = mapping->host; |
| 401 | 401 | ||
| 402 | if (to > inode->i_size) { | 402 | if (to > inode->i_size) { |
| 403 | truncate_pagecache(inode, to, inode->i_size); | 403 | truncate_pagecache(inode, inode->i_size); |
| 404 | minix_truncate(inode); | 404 | minix_truncate(inode); |
| 405 | } | 405 | } |
| 406 | } | 406 | } |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 87e797640828..eda8879171c4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
| @@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr); | |||
| 541 | */ | 541 | */ |
| 542 | static int nfs_vmtruncate(struct inode * inode, loff_t offset) | 542 | static int nfs_vmtruncate(struct inode * inode, loff_t offset) |
| 543 | { | 543 | { |
| 544 | loff_t oldsize; | ||
| 545 | int err; | 544 | int err; |
| 546 | 545 | ||
| 547 | err = inode_newsize_ok(inode, offset); | 546 | err = inode_newsize_ok(inode, offset); |
| @@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) | |||
| 549 | goto out; | 548 | goto out; |
| 550 | 549 | ||
| 551 | spin_lock(&inode->i_lock); | 550 | spin_lock(&inode->i_lock); |
| 552 | oldsize = inode->i_size; | ||
| 553 | i_size_write(inode, offset); | 551 | i_size_write(inode, offset); |
| 554 | spin_unlock(&inode->i_lock); | 552 | spin_unlock(&inode->i_lock); |
| 555 | 553 | ||
| 556 | truncate_pagecache(inode, oldsize, offset); | 554 | truncate_pagecache(inode, offset); |
| 557 | out: | 555 | out: |
| 558 | return err; | 556 | return err; |
| 559 | } | 557 | } |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b1a5277cfd18..7e350c562e0e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
| @@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 254 | struct inode *inode = mapping->host; | 254 | struct inode *inode = mapping->host; |
| 255 | 255 | ||
| 256 | if (to > inode->i_size) { | 256 | if (to > inode->i_size) { |
| 257 | truncate_pagecache(inode, to, inode->i_size); | 257 | truncate_pagecache(inode, inode->i_size); |
| 258 | nilfs_truncate(inode); | 258 | nilfs_truncate(inode); |
| 259 | } | 259 | } |
| 260 | } | 260 | } |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c5670b8d198c..ea4ba9daeb47 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
| @@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 1768 | struct inode *inode = mapping->host; | 1768 | struct inode *inode = mapping->host; |
| 1769 | 1769 | ||
| 1770 | if (to > inode->i_size) { | 1770 | if (to > inode->i_size) { |
| 1771 | truncate_pagecache(inode, to, inode->i_size); | 1771 | truncate_pagecache(inode, inode->i_size); |
| 1772 | ntfs_truncate_vfs(inode); | 1772 | ntfs_truncate_vfs(inode); |
| 1773 | } | 1773 | } |
| 1774 | } | 1774 | } |
diff --git a/fs/omfs/file.c b/fs/omfs/file.c index e0d9b3e722bd..54d57d6ba68d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c | |||
| @@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) | |||
| 311 | struct inode *inode = mapping->host; | 311 | struct inode *inode = mapping->host; |
| 312 | 312 | ||
| 313 | if (to > inode->i_size) { | 313 | if (to > inode->i_size) { |
| 314 | truncate_pagecache(inode, to, inode->i_size); | 314 | truncate_pagecache(inode, inode->i_size); |
| 315 | omfs_truncate(inode); | 315 | omfs_truncate(inode); |
| 316 | } | 316 | } |
| 317 | } | 317 | } |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5aa847a603c0..59d85d608898 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
| @@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 132 | K(i.freeswap), | 132 | K(i.freeswap), |
| 133 | K(global_page_state(NR_FILE_DIRTY)), | 133 | K(global_page_state(NR_FILE_DIRTY)), |
| 134 | K(global_page_state(NR_WRITEBACK)), | 134 | K(global_page_state(NR_WRITEBACK)), |
| 135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 136 | K(global_page_state(NR_ANON_PAGES) | ||
| 137 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | ||
| 138 | HPAGE_PMD_NR), | ||
| 139 | #else | ||
| 140 | K(global_page_state(NR_ANON_PAGES)), | 135 | K(global_page_state(NR_ANON_PAGES)), |
| 141 | #endif | ||
| 142 | K(global_page_state(NR_FILE_MAPPED)), | 136 | K(global_page_state(NR_FILE_MAPPED)), |
| 143 | K(global_page_state(NR_SHMEM)), | 137 | K(global_page_state(NR_SHMEM)), |
| 144 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 138 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index c1a591a4725b..66bc316927e8 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c | |||
| @@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) | |||
| 469 | struct inode *inode = mapping->host; | 469 | struct inode *inode = mapping->host; |
| 470 | 470 | ||
| 471 | if (to > inode->i_size) { | 471 | if (to > inode->i_size) { |
| 472 | truncate_pagecache(inode, to, inode->i_size); | 472 | truncate_pagecache(inode, inode->i_size); |
| 473 | sysv_truncate(inode); | 473 | sysv_truncate(inode); |
| 474 | } | 474 | } |
| 475 | } | 475 | } |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b6d15d349810..062b7925bca0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
| @@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) | |||
| 172 | loff_t isize = inode->i_size; | 172 | loff_t isize = inode->i_size; |
| 173 | 173 | ||
| 174 | if (to > isize) { | 174 | if (to > isize) { |
| 175 | truncate_pagecache(inode, to, isize); | 175 | truncate_pagecache(inode, isize); |
| 176 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { | 176 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { |
| 177 | down_write(&iinfo->i_data_sem); | 177 | down_write(&iinfo->i_data_sem); |
| 178 | udf_clear_extent_cache(inode); | 178 | udf_clear_extent_cache(inode); |
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ff24e4449ece..c8ca96086784 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c | |||
| @@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) | |||
| 531 | struct inode *inode = mapping->host; | 531 | struct inode *inode = mapping->host; |
| 532 | 532 | ||
| 533 | if (to > inode->i_size) | 533 | if (to > inode->i_size) |
| 534 | truncate_pagecache(inode, to, inode->i_size); | 534 | truncate_pagecache(inode, inode->i_size); |
| 535 | } | 535 | } |
| 536 | 536 | ||
| 537 | static int ufs_write_begin(struct file *file, struct address_space *mapping, | 537 | static int ufs_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 977da0ec6604..e51e581454e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
| @@ -1582,7 +1582,7 @@ xfs_vm_write_begin( | |||
| 1582 | unlock_page(page); | 1582 | unlock_page(page); |
| 1583 | 1583 | ||
| 1584 | if (pos + len > i_size_read(inode)) | 1584 | if (pos + len > i_size_read(inode)) |
| 1585 | truncate_pagecache(inode, pos + len, i_size_read(inode)); | 1585 | truncate_pagecache(inode, i_size_read(inode)); |
| 1586 | 1586 | ||
| 1587 | page_cache_release(page); | 1587 | page_cache_release(page); |
| 1588 | page = NULL; | 1588 | page = NULL; |
| @@ -1618,7 +1618,7 @@ xfs_vm_write_end( | |||
| 1618 | loff_t to = pos + len; | 1618 | loff_t to = pos + len; |
| 1619 | 1619 | ||
| 1620 | if (to > isize) { | 1620 | if (to > isize) { |
| 1621 | truncate_pagecache(inode, to, isize); | 1621 | truncate_pagecache(inode, isize); |
| 1622 | xfs_vm_kill_delalloc_range(inode, isize, to); | 1622 | xfs_vm_kill_delalloc_range(inode, isize, to); |
| 1623 | } | 1623 | } |
| 1624 | } | 1624 | } |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b60de92e2edc..3935428c57cf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
| @@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 96 | pmd_t *dst_pmd, pmd_t *src_pmd, | 96 | pmd_t *dst_pmd, pmd_t *src_pmd, |
| 97 | struct vm_area_struct *vma, | 97 | struct vm_area_struct *vma, |
| 98 | unsigned long addr, unsigned long end); | 98 | unsigned long addr, unsigned long end); |
| 99 | extern int handle_pte_fault(struct mm_struct *mm, | ||
| 100 | struct vm_area_struct *vma, unsigned long address, | ||
| 101 | pte_t *pte, pmd_t *pmd, unsigned int flags); | ||
| 102 | extern int split_huge_page_to_list(struct page *page, struct list_head *list); | 99 | extern int split_huge_page_to_list(struct page *page, struct list_head *list); |
| 103 | static inline int split_huge_page(struct page *page) | 100 | static inline int split_huge_page(struct page *page) |
| 104 | { | 101 | { |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c416092e324..60e95872da29 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -30,9 +30,21 @@ struct page; | |||
| 30 | struct mm_struct; | 30 | struct mm_struct; |
| 31 | struct kmem_cache; | 31 | struct kmem_cache; |
| 32 | 32 | ||
| 33 | /* Stats that can be updated by kernel. */ | 33 | /* |
| 34 | enum mem_cgroup_page_stat_item { | 34 | * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, |
| 35 | MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ | 35 | * These two lists should keep in accord with each other. |
| 36 | */ | ||
| 37 | enum mem_cgroup_stat_index { | ||
| 38 | /* | ||
| 39 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
| 40 | */ | ||
| 41 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
| 42 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | ||
| 43 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ | ||
| 44 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | ||
| 45 | MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ | ||
| 46 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
| 47 | MEM_CGROUP_STAT_NSTATS, | ||
| 36 | }; | 48 | }; |
| 37 | 49 | ||
| 38 | struct mem_cgroup_reclaim_cookie { | 50 | struct mem_cgroup_reclaim_cookie { |
| @@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie { | |||
| 41 | unsigned int generation; | 53 | unsigned int generation; |
| 42 | }; | 54 | }; |
| 43 | 55 | ||
| 56 | enum mem_cgroup_filter_t { | ||
| 57 | VISIT, /* visit current node */ | ||
| 58 | SKIP, /* skip the current node and continue traversal */ | ||
| 59 | SKIP_TREE, /* skip the whole subtree and continue traversal */ | ||
| 60 | }; | ||
| 61 | |||
| 62 | /* | ||
| 63 | * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to | ||
| 64 | * iterate through the hierarchy tree. Each tree element is checked by the | ||
| 65 | * predicate before it is returned by the iterator. If a filter returns | ||
| 66 | * SKIP or SKIP_TREE then the iterator code continues traversal (with the | ||
| 67 | * next node down the hierarchy or the next node that doesn't belong under the | ||
| 68 | * memcg's subtree). | ||
| 69 | */ | ||
| 70 | typedef enum mem_cgroup_filter_t | ||
| 71 | (*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); | ||
| 72 | |||
| 44 | #ifdef CONFIG_MEMCG | 73 | #ifdef CONFIG_MEMCG |
| 45 | /* | 74 | /* |
| 46 | * All "charge" functions with gfp_mask should use GFP_KERNEL or | 75 | * All "charge" functions with gfp_mask should use GFP_KERNEL or |
| @@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
| 108 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 137 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
| 109 | struct page *oldpage, struct page *newpage, bool migration_ok); | 138 | struct page *oldpage, struct page *newpage, bool migration_ok); |
| 110 | 139 | ||
| 111 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, | 140 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, |
| 112 | struct mem_cgroup *, | 141 | struct mem_cgroup *prev, |
| 113 | struct mem_cgroup_reclaim_cookie *); | 142 | struct mem_cgroup_reclaim_cookie *reclaim, |
| 143 | mem_cgroup_iter_filter cond); | ||
| 144 | |||
| 145 | static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | ||
| 146 | struct mem_cgroup *prev, | ||
| 147 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
| 148 | { | ||
| 149 | return mem_cgroup_iter_cond(root, prev, reclaim, NULL); | ||
| 150 | } | ||
| 151 | |||
| 114 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | 152 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); |
| 115 | 153 | ||
| 116 | /* | 154 | /* |
| @@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | |||
| 125 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, | 163 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, |
| 126 | struct page *newpage); | 164 | struct page *newpage); |
| 127 | 165 | ||
| 166 | /** | ||
| 167 | * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task | ||
| 168 | * @new: true to enable, false to disable | ||
| 169 | * | ||
| 170 | * Toggle whether a failed memcg charge should invoke the OOM killer | ||
| 171 | * or just return -ENOMEM. Returns the previous toggle state. | ||
| 172 | * | ||
| 173 | * NOTE: Any path that enables the OOM killer before charging must | ||
| 174 | * call mem_cgroup_oom_synchronize() afterward to finalize the | ||
| 175 | * OOM handling and clean up. | ||
| 176 | */ | ||
| 177 | static inline bool mem_cgroup_toggle_oom(bool new) | ||
| 178 | { | ||
| 179 | bool old; | ||
| 180 | |||
| 181 | old = current->memcg_oom.may_oom; | ||
| 182 | current->memcg_oom.may_oom = new; | ||
| 183 | |||
| 184 | return old; | ||
| 185 | } | ||
| 186 | |||
| 187 | static inline void mem_cgroup_enable_oom(void) | ||
| 188 | { | ||
| 189 | bool old = mem_cgroup_toggle_oom(true); | ||
| 190 | |||
| 191 | WARN_ON(old == true); | ||
| 192 | } | ||
| 193 | |||
| 194 | static inline void mem_cgroup_disable_oom(void) | ||
| 195 | { | ||
| 196 | bool old = mem_cgroup_toggle_oom(false); | ||
| 197 | |||
| 198 | WARN_ON(old == false); | ||
| 199 | } | ||
| 200 | |||
| 201 | static inline bool task_in_memcg_oom(struct task_struct *p) | ||
| 202 | { | ||
| 203 | return p->memcg_oom.in_memcg_oom; | ||
| 204 | } | ||
| 205 | |||
| 206 | bool mem_cgroup_oom_synchronize(void); | ||
| 207 | |||
| 128 | #ifdef CONFIG_MEMCG_SWAP | 208 | #ifdef CONFIG_MEMCG_SWAP |
| 129 | extern int do_swap_account; | 209 | extern int do_swap_account; |
| 130 | #endif | 210 | #endif |
| @@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, | |||
| 165 | } | 245 | } |
| 166 | 246 | ||
| 167 | void mem_cgroup_update_page_stat(struct page *page, | 247 | void mem_cgroup_update_page_stat(struct page *page, |
| 168 | enum mem_cgroup_page_stat_item idx, | 248 | enum mem_cgroup_stat_index idx, |
| 169 | int val); | 249 | int val); |
| 170 | 250 | ||
| 171 | static inline void mem_cgroup_inc_page_stat(struct page *page, | 251 | static inline void mem_cgroup_inc_page_stat(struct page *page, |
| 172 | enum mem_cgroup_page_stat_item idx) | 252 | enum mem_cgroup_stat_index idx) |
| 173 | { | 253 | { |
| 174 | mem_cgroup_update_page_stat(page, idx, 1); | 254 | mem_cgroup_update_page_stat(page, idx, 1); |
| 175 | } | 255 | } |
| 176 | 256 | ||
| 177 | static inline void mem_cgroup_dec_page_stat(struct page *page, | 257 | static inline void mem_cgroup_dec_page_stat(struct page *page, |
| 178 | enum mem_cgroup_page_stat_item idx) | 258 | enum mem_cgroup_stat_index idx) |
| 179 | { | 259 | { |
| 180 | mem_cgroup_update_page_stat(page, idx, -1); | 260 | mem_cgroup_update_page_stat(page, idx, -1); |
| 181 | } | 261 | } |
| 182 | 262 | ||
| 183 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 263 | enum mem_cgroup_filter_t |
| 184 | gfp_t gfp_mask, | 264 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
| 185 | unsigned long *total_scanned); | 265 | struct mem_cgroup *root); |
| 186 | 266 | ||
| 187 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 267 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
| 188 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, | 268 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, |
| @@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
| 296 | struct page *oldpage, struct page *newpage, bool migration_ok) | 376 | struct page *oldpage, struct page *newpage, bool migration_ok) |
| 297 | { | 377 | { |
| 298 | } | 378 | } |
| 379 | static inline struct mem_cgroup * | ||
| 380 | mem_cgroup_iter_cond(struct mem_cgroup *root, | ||
| 381 | struct mem_cgroup *prev, | ||
| 382 | struct mem_cgroup_reclaim_cookie *reclaim, | ||
| 383 | mem_cgroup_iter_filter cond) | ||
| 384 | { | ||
| 385 | /* first call must return non-NULL, second return NULL */ | ||
| 386 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
| 387 | } | ||
| 299 | 388 | ||
| 300 | static inline struct mem_cgroup * | 389 | static inline struct mem_cgroup * |
| 301 | mem_cgroup_iter(struct mem_cgroup *root, | 390 | mem_cgroup_iter(struct mem_cgroup *root, |
| @@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, | |||
| 348 | { | 437 | { |
| 349 | } | 438 | } |
| 350 | 439 | ||
| 440 | static inline bool mem_cgroup_toggle_oom(bool new) | ||
| 441 | { | ||
| 442 | return false; | ||
| 443 | } | ||
| 444 | |||
| 445 | static inline void mem_cgroup_enable_oom(void) | ||
| 446 | { | ||
| 447 | } | ||
| 448 | |||
| 449 | static inline void mem_cgroup_disable_oom(void) | ||
| 450 | { | ||
| 451 | } | ||
| 452 | |||
| 453 | static inline bool task_in_memcg_oom(struct task_struct *p) | ||
| 454 | { | ||
| 455 | return false; | ||
| 456 | } | ||
| 457 | |||
| 458 | static inline bool mem_cgroup_oom_synchronize(void) | ||
| 459 | { | ||
| 460 | return false; | ||
| 461 | } | ||
| 462 | |||
| 351 | static inline void mem_cgroup_inc_page_stat(struct page *page, | 463 | static inline void mem_cgroup_inc_page_stat(struct page *page, |
| 352 | enum mem_cgroup_page_stat_item idx) | 464 | enum mem_cgroup_stat_index idx) |
| 353 | { | 465 | { |
| 354 | } | 466 | } |
| 355 | 467 | ||
| 356 | static inline void mem_cgroup_dec_page_stat(struct page *page, | 468 | static inline void mem_cgroup_dec_page_stat(struct page *page, |
| 357 | enum mem_cgroup_page_stat_item idx) | 469 | enum mem_cgroup_stat_index idx) |
| 358 | { | 470 | { |
| 359 | } | 471 | } |
| 360 | 472 | ||
| 361 | static inline | 473 | static inline |
| 362 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 474 | enum mem_cgroup_filter_t |
| 363 | gfp_t gfp_mask, | 475 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
| 364 | unsigned long *total_scanned) | 476 | struct mem_cgroup *root) |
| 365 | { | 477 | { |
| 366 | return 0; | 478 | return VISIT; |
| 367 | } | 479 | } |
| 368 | 480 | ||
| 369 | static inline void mem_cgroup_split_huge_fixup(struct page *head) | 481 | static inline void mem_cgroup_split_huge_fixup(struct page *head) |
diff --git a/include/linux/mm.h b/include/linux/mm.h index caf543c7eaa7..8b6e55ee8855 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -176,6 +176,7 @@ extern pgprot_t protection_map[16]; | |||
| 176 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ | 176 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ |
| 177 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ | 177 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ |
| 178 | #define FAULT_FLAG_TRIED 0x40 /* second try */ | 178 | #define FAULT_FLAG_TRIED 0x40 /* second try */ |
| 179 | #define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ | ||
| 179 | 180 | ||
| 180 | /* | 181 | /* |
| 181 | * vm_fault is filled by the the pagefault handler and passed to the vma's | 182 | * vm_fault is filled by the the pagefault handler and passed to the vma's |
| @@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page) | |||
| 876 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ | 877 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
| 877 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 878 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
| 878 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ | 879 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ |
| 880 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ | ||
| 879 | 881 | ||
| 880 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ | 882 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
| 881 | 883 | ||
| 882 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ | 884 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ |
| 883 | VM_FAULT_HWPOISON_LARGE) | 885 | VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) |
| 884 | 886 | ||
| 885 | /* Encode hstate index for a hwpoisoned large page */ | 887 | /* Encode hstate index for a hwpoisoned large page */ |
| 886 | #define VM_FAULT_SET_HINDEX(x) ((x) << 12) | 888 | #define VM_FAULT_SET_HINDEX(x) ((x) << 12) |
| @@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, | |||
| 984 | unmap_mapping_range(mapping, holebegin, holelen, 0); | 986 | unmap_mapping_range(mapping, holebegin, holelen, 0); |
| 985 | } | 987 | } |
| 986 | 988 | ||
| 987 | extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); | 989 | extern void truncate_pagecache(struct inode *inode, loff_t new); |
| 988 | extern void truncate_setsize(struct inode *inode, loff_t newsize); | 990 | extern void truncate_setsize(struct inode *inode, loff_t newsize); |
| 989 | void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); | 991 | void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); |
| 990 | int truncate_inode_page(struct address_space *mapping, struct page *page); | 992 | int truncate_inode_page(struct address_space *mapping, struct page *page); |
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 96a509b6be04..201a69749659 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h | |||
| @@ -54,7 +54,7 @@ struct res_counter { | |||
| 54 | struct res_counter *parent; | 54 | struct res_counter *parent; |
| 55 | }; | 55 | }; |
| 56 | 56 | ||
| 57 | #define RESOURCE_MAX (unsigned long long)LLONG_MAX | 57 | #define RES_COUNTER_MAX ULLONG_MAX |
| 58 | 58 | ||
| 59 | /** | 59 | /** |
| 60 | * Helpers to interact with userspace | 60 | * Helpers to interact with userspace |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 45f254dddafc..6682da36b293 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -1393,6 +1393,13 @@ struct task_struct { | |||
| 1393 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ | 1393 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ |
| 1394 | } memcg_batch; | 1394 | } memcg_batch; |
| 1395 | unsigned int memcg_kmem_skip_account; | 1395 | unsigned int memcg_kmem_skip_account; |
| 1396 | struct memcg_oom_info { | ||
| 1397 | unsigned int may_oom:1; | ||
| 1398 | unsigned int in_memcg_oom:1; | ||
| 1399 | unsigned int oom_locked:1; | ||
| 1400 | int wakeups; | ||
| 1401 | struct mem_cgroup *wait_on_memcg; | ||
| 1402 | } memcg_oom; | ||
| 1396 | #endif | 1403 | #endif |
| 1397 | #ifdef CONFIG_UPROBES | 1404 | #ifdef CONFIG_UPROBES |
| 1398 | struct uprobe_task *utask; | 1405 | struct uprobe_task *utask; |
diff --git a/include/linux/swap.h b/include/linux/swap.h index c03c139219c9..46ba0c6c219f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
| @@ -280,7 +280,7 @@ extern void activate_page(struct page *); | |||
| 280 | extern void mark_page_accessed(struct page *); | 280 | extern void mark_page_accessed(struct page *); |
| 281 | extern void lru_add_drain(void); | 281 | extern void lru_add_drain(void); |
| 282 | extern void lru_add_drain_cpu(int cpu); | 282 | extern void lru_add_drain_cpu(int cpu); |
| 283 | extern int lru_add_drain_all(void); | 283 | extern void lru_add_drain_all(void); |
| 284 | extern void rotate_reclaimable_page(struct page *page); | 284 | extern void rotate_reclaimable_page(struct page *page); |
| 285 | extern void deactivate_page(struct page *page); | 285 | extern void deactivate_page(struct page *page); |
| 286 | extern void swap_setup(void); | 286 | extern void swap_setup(void); |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
| @@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) | |||
| 74 | { | 74 | { |
| 75 | unsigned long val; | 75 | unsigned long val; |
| 76 | 76 | ||
| 77 | if (strict_strtoul(str, 0, &val)) { | 77 | if (kstrtoul(str, 0, &val)) { |
| 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); | 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); |
| 79 | return 0; | 79 | return 0; |
| 80 | } | 80 | } |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, | |||
| 113 | unsigned long cnt; | 113 | unsigned long cnt; |
| 114 | int ret; | 114 | int ret; |
| 115 | 115 | ||
| 116 | if (strict_strtoul(buf, 0, &cnt)) | 116 | if (kstrtoul(buf, 0, &cnt)) |
| 117 | return -EINVAL; | 117 | return -EINVAL; |
| 118 | 118 | ||
| 119 | ret = crash_shrink_memory(cnt); | 119 | ret = crash_shrink_memory(cnt); |
diff --git a/kernel/params.c b/kernel/params.c index 501bde4f3bee..81c4e78c8f4c 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -253,13 +253,13 @@ int parse_args(const char *doing, | |||
| 253 | EXPORT_SYMBOL(param_ops_##name) | 253 | EXPORT_SYMBOL(param_ops_##name) |
| 254 | 254 | ||
| 255 | 255 | ||
| 256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); | 256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); |
| 257 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 257 | STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); |
| 258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); | 258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); |
| 259 | STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); | 259 | STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); |
| 260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | 260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); |
| 261 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 261 | STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); |
| 262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); |
| 263 | 263 | ||
| 264 | int param_set_charp(const char *val, const struct kernel_param *kp) | 264 | int param_set_charp(const char *val, const struct kernel_param *kp) |
| 265 | { | 265 | { |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
| @@ -17,8 +17,8 @@ | |||
| 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) |
| 18 | { | 18 | { |
| 19 | spin_lock_init(&counter->lock); | 19 | spin_lock_init(&counter->lock); |
| 20 | counter->limit = RESOURCE_MAX; | 20 | counter->limit = RES_COUNTER_MAX; |
| 21 | counter->soft_limit = RESOURCE_MAX; | 21 | counter->soft_limit = RES_COUNTER_MAX; |
| 22 | counter->parent = parent; | 22 | counter->parent = parent; |
| 23 | } | 23 | } |
| 24 | 24 | ||
| @@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) | |||
| 178 | #endif | 178 | #endif |
| 179 | 179 | ||
| 180 | int res_counter_memparse_write_strategy(const char *buf, | 180 | int res_counter_memparse_write_strategy(const char *buf, |
| 181 | unsigned long long *res) | 181 | unsigned long long *resp) |
| 182 | { | 182 | { |
| 183 | char *end; | 183 | char *end; |
| 184 | unsigned long long res; | ||
| 184 | 185 | ||
| 185 | /* return RESOURCE_MAX(unlimited) if "-1" is specified */ | 186 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
| 186 | if (*buf == '-') { | 187 | if (*buf == '-') { |
| 187 | *res = simple_strtoull(buf + 1, &end, 10); | 188 | res = simple_strtoull(buf + 1, &end, 10); |
| 188 | if (*res != 1 || *end != '\0') | 189 | if (res != 1 || *end != '\0') |
| 189 | return -EINVAL; | 190 | return -EINVAL; |
| 190 | *res = RESOURCE_MAX; | 191 | *resp = RES_COUNTER_MAX; |
| 191 | return 0; | 192 | return 0; |
| 192 | } | 193 | } |
| 193 | 194 | ||
| 194 | *res = memparse(buf, &end); | 195 | res = memparse(buf, &end); |
| 195 | if (*end != '\0') | 196 | if (*end != '\0') |
| 196 | return -EINVAL; | 197 | return -EINVAL; |
| 197 | 198 | ||
| 198 | *res = PAGE_ALIGN(*res); | 199 | if (PAGE_ALIGN(res) >= res) |
| 200 | res = PAGE_ALIGN(res); | ||
| 201 | else | ||
| 202 | res = RES_COUNTER_MAX; | ||
| 203 | |||
| 204 | *resp = res; | ||
| 205 | |||
| 199 | return 0; | 206 | return 0; |
| 200 | } | 207 | } |
diff --git a/mm/Kconfig b/mm/Kconfig index 6cdd27043303..026771a9b097 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -245,7 +245,7 @@ config COMPACTION | |||
| 245 | config MIGRATION | 245 | config MIGRATION |
| 246 | bool "Page migration" | 246 | bool "Page migration" |
| 247 | def_bool y | 247 | def_bool y |
| 248 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA | 248 | depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU |
| 249 | help | 249 | help |
| 250 | Allows the migration of the physical location of pages of processes | 250 | Allows the migration of the physical location of pages of processes |
| 251 | while the virtual addresses are not changed. This is useful in | 251 | while the virtual addresses are not changed. This is useful in |
| @@ -480,7 +480,7 @@ config FRONTSWAP | |||
| 480 | 480 | ||
| 481 | config CMA | 481 | config CMA |
| 482 | bool "Contiguous Memory Allocator" | 482 | bool "Contiguous Memory Allocator" |
| 483 | depends on HAVE_MEMBLOCK | 483 | depends on HAVE_MEMBLOCK && MMU |
| 484 | select MIGRATION | 484 | select MIGRATION |
| 485 | select MEMORY_ISOLATION | 485 | select MEMORY_ISOLATION |
| 486 | help | 486 | help |
diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a8..1e6aec4a2d2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
| 467 | error = mem_cgroup_cache_charge(page, current->mm, | 467 | error = mem_cgroup_cache_charge(page, current->mm, |
| 468 | gfp_mask & GFP_RECLAIM_MASK); | 468 | gfp_mask & GFP_RECLAIM_MASK); |
| 469 | if (error) | 469 | if (error) |
| 470 | goto out; | 470 | return error; |
| 471 | 471 | ||
| 472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); | 472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); |
| 473 | if (error == 0) { | 473 | if (error) { |
| 474 | page_cache_get(page); | ||
| 475 | page->mapping = mapping; | ||
| 476 | page->index = offset; | ||
| 477 | |||
| 478 | spin_lock_irq(&mapping->tree_lock); | ||
| 479 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
| 480 | if (likely(!error)) { | ||
| 481 | mapping->nrpages++; | ||
| 482 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
| 483 | spin_unlock_irq(&mapping->tree_lock); | ||
| 484 | trace_mm_filemap_add_to_page_cache(page); | ||
| 485 | } else { | ||
| 486 | page->mapping = NULL; | ||
| 487 | /* Leave page->index set: truncation relies upon it */ | ||
| 488 | spin_unlock_irq(&mapping->tree_lock); | ||
| 489 | mem_cgroup_uncharge_cache_page(page); | ||
| 490 | page_cache_release(page); | ||
| 491 | } | ||
| 492 | radix_tree_preload_end(); | ||
| 493 | } else | ||
| 494 | mem_cgroup_uncharge_cache_page(page); | 474 | mem_cgroup_uncharge_cache_page(page); |
| 495 | out: | 475 | return error; |
| 476 | } | ||
| 477 | |||
| 478 | page_cache_get(page); | ||
| 479 | page->mapping = mapping; | ||
| 480 | page->index = offset; | ||
| 481 | |||
| 482 | spin_lock_irq(&mapping->tree_lock); | ||
| 483 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
| 484 | radix_tree_preload_end(); | ||
| 485 | if (unlikely(error)) | ||
| 486 | goto err_insert; | ||
| 487 | mapping->nrpages++; | ||
| 488 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
| 489 | spin_unlock_irq(&mapping->tree_lock); | ||
| 490 | trace_mm_filemap_add_to_page_cache(page); | ||
| 491 | return 0; | ||
| 492 | err_insert: | ||
| 493 | page->mapping = NULL; | ||
| 494 | /* Leave page->index set: truncation relies upon it */ | ||
| 495 | spin_unlock_irq(&mapping->tree_lock); | ||
| 496 | mem_cgroup_uncharge_cache_page(page); | ||
| 497 | page_cache_release(page); | ||
| 496 | return error; | 498 | return error; |
| 497 | } | 499 | } |
| 498 | EXPORT_SYMBOL(add_to_page_cache_locked); | 500 | EXPORT_SYMBOL(add_to_page_cache_locked); |
| @@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1614 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
| 1615 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
| 1616 | struct page *page; | 1618 | struct page *page; |
| 1619 | bool memcg_oom; | ||
| 1617 | pgoff_t size; | 1620 | pgoff_t size; |
| 1618 | int ret = 0; | 1621 | int ret = 0; |
| 1619 | 1622 | ||
| @@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1622 | return VM_FAULT_SIGBUS; | 1625 | return VM_FAULT_SIGBUS; |
| 1623 | 1626 | ||
| 1624 | /* | 1627 | /* |
| 1625 | * Do we have something in the page cache already? | 1628 | * Do we have something in the page cache already? Either |
| 1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
| 1630 | * as readahead is optional and no errors are propagated up | ||
| 1631 | * the fault stack. The OOM killer is enabled while trying to | ||
| 1632 | * instantiate the faulting page individually below. | ||
| 1626 | */ | 1633 | */ |
| 1627 | page = find_get_page(mapping, offset); | 1634 | page = find_get_page(mapping, offset); |
| 1628 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
| @@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1630 | * We found the page, so try async readahead before | 1637 | * We found the page, so try async readahead before |
| 1631 | * waiting for the lock. | 1638 | * waiting for the lock. |
| 1632 | */ | 1639 | */ |
| 1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
| 1633 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1641 | do_async_mmap_readahead(vma, ra, file, page, offset); |
| 1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
| 1634 | } else if (!page) { | 1643 | } else if (!page) { |
| 1635 | /* No page in the page cache at all */ | 1644 | /* No page in the page cache at all */ |
| 1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
| 1636 | do_sync_mmap_readahead(vma, ra, file, offset); | 1646 | do_sync_mmap_readahead(vma, ra, file, offset); |
| 1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
| 1637 | count_vm_event(PGMAJFAULT); | 1648 | count_vm_event(PGMAJFAULT); |
| 1638 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
| 1639 | ret = VM_FAULT_MAJOR; | 1650 | ret = VM_FAULT_MAJOR; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d66010e0049d..7489884682d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
| 695 | return pmd; | 695 | return pmd; |
| 696 | } | 696 | } |
| 697 | 697 | ||
| 698 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | 698 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) |
| 699 | { | 699 | { |
| 700 | pmd_t entry; | 700 | pmd_t entry; |
| 701 | entry = mk_pmd(page, vma->vm_page_prot); | 701 | entry = mk_pmd(page, prot); |
| 702 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 703 | entry = pmd_mkhuge(entry); | 702 | entry = pmd_mkhuge(entry); |
| 704 | return entry; | 703 | return entry; |
| 705 | } | 704 | } |
| @@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
| 732 | pte_free(mm, pgtable); | 731 | pte_free(mm, pgtable); |
| 733 | } else { | 732 | } else { |
| 734 | pmd_t entry; | 733 | pmd_t entry; |
| 735 | entry = mk_huge_pmd(page, vma); | 734 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
| 735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 736 | page_add_new_anon_rmap(page, vma, haddr); | 736 | page_add_new_anon_rmap(page, vma, haddr); |
| 737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
| 738 | set_pmd_at(mm, haddr, pmd, entry); | 738 | set_pmd_at(mm, haddr, pmd, entry); |
| @@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 788 | { | 788 | { |
| 789 | struct page *page; | 789 | struct page *page; |
| 790 | unsigned long haddr = address & HPAGE_PMD_MASK; | 790 | unsigned long haddr = address & HPAGE_PMD_MASK; |
| 791 | pte_t *pte; | ||
| 792 | 791 | ||
| 793 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | 792 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
| 794 | if (unlikely(anon_vma_prepare(vma))) | 793 | return VM_FAULT_FALLBACK; |
| 795 | return VM_FAULT_OOM; | 794 | if (unlikely(anon_vma_prepare(vma))) |
| 796 | if (unlikely(khugepaged_enter(vma))) | 795 | return VM_FAULT_OOM; |
| 796 | if (unlikely(khugepaged_enter(vma))) | ||
| 797 | return VM_FAULT_OOM; | ||
| 798 | if (!(flags & FAULT_FLAG_WRITE) && | ||
| 799 | transparent_hugepage_use_zero_page()) { | ||
| 800 | pgtable_t pgtable; | ||
| 801 | struct page *zero_page; | ||
| 802 | bool set; | ||
| 803 | pgtable = pte_alloc_one(mm, haddr); | ||
| 804 | if (unlikely(!pgtable)) | ||
| 797 | return VM_FAULT_OOM; | 805 | return VM_FAULT_OOM; |
| 798 | if (!(flags & FAULT_FLAG_WRITE) && | 806 | zero_page = get_huge_zero_page(); |
| 799 | transparent_hugepage_use_zero_page()) { | 807 | if (unlikely(!zero_page)) { |
| 800 | pgtable_t pgtable; | 808 | pte_free(mm, pgtable); |
| 801 | struct page *zero_page; | ||
| 802 | bool set; | ||
| 803 | pgtable = pte_alloc_one(mm, haddr); | ||
| 804 | if (unlikely(!pgtable)) | ||
| 805 | return VM_FAULT_OOM; | ||
| 806 | zero_page = get_huge_zero_page(); | ||
| 807 | if (unlikely(!zero_page)) { | ||
| 808 | pte_free(mm, pgtable); | ||
| 809 | count_vm_event(THP_FAULT_FALLBACK); | ||
| 810 | goto out; | ||
| 811 | } | ||
| 812 | spin_lock(&mm->page_table_lock); | ||
| 813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
| 814 | zero_page); | ||
| 815 | spin_unlock(&mm->page_table_lock); | ||
| 816 | if (!set) { | ||
| 817 | pte_free(mm, pgtable); | ||
| 818 | put_huge_zero_page(); | ||
| 819 | } | ||
| 820 | return 0; | ||
| 821 | } | ||
| 822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
| 823 | vma, haddr, numa_node_id(), 0); | ||
| 824 | if (unlikely(!page)) { | ||
| 825 | count_vm_event(THP_FAULT_FALLBACK); | 809 | count_vm_event(THP_FAULT_FALLBACK); |
| 826 | goto out; | 810 | return VM_FAULT_FALLBACK; |
| 827 | } | ||
| 828 | count_vm_event(THP_FAULT_ALLOC); | ||
| 829 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
| 830 | put_page(page); | ||
| 831 | goto out; | ||
| 832 | } | 811 | } |
| 833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | 812 | spin_lock(&mm->page_table_lock); |
| 834 | page))) { | 813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
| 835 | mem_cgroup_uncharge_page(page); | 814 | zero_page); |
| 836 | put_page(page); | 815 | spin_unlock(&mm->page_table_lock); |
| 837 | goto out; | 816 | if (!set) { |
| 817 | pte_free(mm, pgtable); | ||
| 818 | put_huge_zero_page(); | ||
| 838 | } | 819 | } |
| 839 | |||
| 840 | return 0; | 820 | return 0; |
| 841 | } | 821 | } |
| 842 | out: | 822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
| 843 | /* | 823 | vma, haddr, numa_node_id(), 0); |
| 844 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 824 | if (unlikely(!page)) { |
| 845 | * run pte_offset_map on the pmd, if an huge pmd could | 825 | count_vm_event(THP_FAULT_FALLBACK); |
| 846 | * materialize from under us from a different thread. | 826 | return VM_FAULT_FALLBACK; |
| 847 | */ | 827 | } |
| 848 | if (unlikely(pmd_none(*pmd)) && | 828 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
| 849 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 829 | put_page(page); |
| 850 | return VM_FAULT_OOM; | 830 | count_vm_event(THP_FAULT_FALLBACK); |
| 851 | /* if an huge pmd materialized from under us just retry later */ | 831 | return VM_FAULT_FALLBACK; |
| 852 | if (unlikely(pmd_trans_huge(*pmd))) | 832 | } |
| 853 | return 0; | 833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { |
| 854 | /* | 834 | mem_cgroup_uncharge_page(page); |
| 855 | * A regular pmd is established and it can't morph into a huge pmd | 835 | put_page(page); |
| 856 | * from under us anymore at this point because we hold the mmap_sem | 836 | count_vm_event(THP_FAULT_FALLBACK); |
| 857 | * read mode and khugepaged takes it in write mode. So now it's | 837 | return VM_FAULT_FALLBACK; |
| 858 | * safe to run pte_offset_map(). | 838 | } |
| 859 | */ | 839 | |
| 860 | pte = pte_offset_map(pmd, address); | 840 | count_vm_event(THP_FAULT_ALLOC); |
| 861 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 841 | return 0; |
| 862 | } | 842 | } |
| 863 | 843 | ||
| 864 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 844 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| @@ -1170,7 +1150,6 @@ alloc: | |||
| 1170 | new_page = NULL; | 1150 | new_page = NULL; |
| 1171 | 1151 | ||
| 1172 | if (unlikely(!new_page)) { | 1152 | if (unlikely(!new_page)) { |
| 1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
| 1174 | if (is_huge_zero_pmd(orig_pmd)) { | 1153 | if (is_huge_zero_pmd(orig_pmd)) { |
| 1175 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, | 1154 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
| 1176 | address, pmd, orig_pmd, haddr); | 1155 | address, pmd, orig_pmd, haddr); |
| @@ -1181,9 +1160,9 @@ alloc: | |||
| 1181 | split_huge_page(page); | 1160 | split_huge_page(page); |
| 1182 | put_page(page); | 1161 | put_page(page); |
| 1183 | } | 1162 | } |
| 1163 | count_vm_event(THP_FAULT_FALLBACK); | ||
| 1184 | goto out; | 1164 | goto out; |
| 1185 | } | 1165 | } |
| 1186 | count_vm_event(THP_FAULT_ALLOC); | ||
| 1187 | 1166 | ||
| 1188 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1167 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
| 1189 | put_page(new_page); | 1168 | put_page(new_page); |
| @@ -1191,10 +1170,13 @@ alloc: | |||
| 1191 | split_huge_page(page); | 1170 | split_huge_page(page); |
| 1192 | put_page(page); | 1171 | put_page(page); |
| 1193 | } | 1172 | } |
| 1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
| 1194 | ret |= VM_FAULT_OOM; | 1174 | ret |= VM_FAULT_OOM; |
| 1195 | goto out; | 1175 | goto out; |
| 1196 | } | 1176 | } |
| 1197 | 1177 | ||
| 1178 | count_vm_event(THP_FAULT_ALLOC); | ||
| 1179 | |||
| 1198 | if (is_huge_zero_pmd(orig_pmd)) | 1180 | if (is_huge_zero_pmd(orig_pmd)) |
| 1199 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | 1181 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); |
| 1200 | else | 1182 | else |
| @@ -1215,7 +1197,8 @@ alloc: | |||
| 1215 | goto out_mn; | 1197 | goto out_mn; |
| 1216 | } else { | 1198 | } else { |
| 1217 | pmd_t entry; | 1199 | pmd_t entry; |
| 1218 | entry = mk_huge_pmd(new_page, vma); | 1200 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
| 1201 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 1219 | pmdp_clear_flush(vma, haddr, pmd); | 1202 | pmdp_clear_flush(vma, haddr, pmd); |
| 1220 | page_add_new_anon_rmap(new_page, vma, haddr); | 1203 | page_add_new_anon_rmap(new_page, vma, haddr); |
| 1221 | set_pmd_at(mm, haddr, pmd, entry); | 1204 | set_pmd_at(mm, haddr, pmd, entry); |
| @@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page, | |||
| 1666 | BUG_ON(atomic_read(&page->_count) <= 0); | 1649 | BUG_ON(atomic_read(&page->_count) <= 0); |
| 1667 | 1650 | ||
| 1668 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); | 1651 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
| 1669 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
| 1670 | 1652 | ||
| 1671 | ClearPageCompound(page); | 1653 | ClearPageCompound(page); |
| 1672 | compound_unlock(page); | 1654 | compound_unlock(page); |
| @@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
| 2364 | __SetPageUptodate(new_page); | 2346 | __SetPageUptodate(new_page); |
| 2365 | pgtable = pmd_pgtable(_pmd); | 2347 | pgtable = pmd_pgtable(_pmd); |
| 2366 | 2348 | ||
| 2367 | _pmd = mk_huge_pmd(new_page, vma); | 2349 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); |
| 2350 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
| 2368 | 2351 | ||
| 2369 | /* | 2352 | /* |
| 2370 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2353 | * spin_lock() below is not the equivalent of smp_wmb(), so |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c6bd28edd533..d5ff3ce13029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -39,7 +39,6 @@ | |||
| 39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
| 40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
| 41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
| 42 | #include <linux/rbtree.h> | ||
| 43 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
| 44 | #include <linux/swap.h> | 43 | #include <linux/swap.h> |
| 45 | #include <linux/swapops.h> | 44 | #include <linux/swapops.h> |
| @@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0; | |||
| 85 | #endif | 84 | #endif |
| 86 | 85 | ||
| 87 | 86 | ||
| 88 | /* | ||
| 89 | * Statistics for memory cgroup. | ||
| 90 | */ | ||
| 91 | enum mem_cgroup_stat_index { | ||
| 92 | /* | ||
| 93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
| 94 | */ | ||
| 95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
| 96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | ||
| 97 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ | ||
| 98 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | ||
| 99 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
| 100 | MEM_CGROUP_STAT_NSTATS, | ||
| 101 | }; | ||
| 102 | |||
| 103 | static const char * const mem_cgroup_stat_names[] = { | 87 | static const char * const mem_cgroup_stat_names[] = { |
| 104 | "cache", | 88 | "cache", |
| 105 | "rss", | 89 | "rss", |
| 106 | "rss_huge", | 90 | "rss_huge", |
| 107 | "mapped_file", | 91 | "mapped_file", |
| 92 | "writeback", | ||
| 108 | "swap", | 93 | "swap", |
| 109 | }; | 94 | }; |
| 110 | 95 | ||
| @@ -175,10 +160,6 @@ struct mem_cgroup_per_zone { | |||
| 175 | 160 | ||
| 176 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
| 177 | 162 | ||
| 178 | struct rb_node tree_node; /* RB tree node */ | ||
| 179 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
| 180 | /* the soft limit is exceeded*/ | ||
| 181 | bool on_tree; | ||
| 182 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
| 183 | /* use container_of */ | 164 | /* use container_of */ |
| 184 | }; | 165 | }; |
| @@ -187,26 +168,6 @@ struct mem_cgroup_per_node { | |||
| 187 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
| 188 | }; | 169 | }; |
| 189 | 170 | ||
| 190 | /* | ||
| 191 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
| 192 | * their hierarchy representation | ||
| 193 | */ | ||
| 194 | |||
| 195 | struct mem_cgroup_tree_per_zone { | ||
| 196 | struct rb_root rb_root; | ||
| 197 | spinlock_t lock; | ||
| 198 | }; | ||
| 199 | |||
| 200 | struct mem_cgroup_tree_per_node { | ||
| 201 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
| 202 | }; | ||
| 203 | |||
| 204 | struct mem_cgroup_tree { | ||
| 205 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
| 206 | }; | ||
| 207 | |||
| 208 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
| 209 | |||
| 210 | struct mem_cgroup_threshold { | 171 | struct mem_cgroup_threshold { |
| 211 | struct eventfd_ctx *eventfd; | 172 | struct eventfd_ctx *eventfd; |
| 212 | u64 threshold; | 173 | u64 threshold; |
| @@ -280,6 +241,7 @@ struct mem_cgroup { | |||
| 280 | 241 | ||
| 281 | bool oom_lock; | 242 | bool oom_lock; |
| 282 | atomic_t under_oom; | 243 | atomic_t under_oom; |
| 244 | atomic_t oom_wakeups; | ||
| 283 | 245 | ||
| 284 | int swappiness; | 246 | int swappiness; |
| 285 | /* OOM-Killer disable */ | 247 | /* OOM-Killer disable */ |
| @@ -304,7 +266,7 @@ struct mem_cgroup { | |||
| 304 | * Should we move charges of a task when a task is moved into this | 266 | * Should we move charges of a task when a task is moved into this |
| 305 | * mem_cgroup ? And what type of charges should we move ? | 267 | * mem_cgroup ? And what type of charges should we move ? |
| 306 | */ | 268 | */ |
| 307 | unsigned long move_charge_at_immigrate; | 269 | unsigned long move_charge_at_immigrate; |
| 308 | /* | 270 | /* |
| 309 | * set > 0 if pages under this cgroup are moving to other cgroup. | 271 | * set > 0 if pages under this cgroup are moving to other cgroup. |
| 310 | */ | 272 | */ |
| @@ -341,6 +303,22 @@ struct mem_cgroup { | |||
| 341 | atomic_t numainfo_events; | 303 | atomic_t numainfo_events; |
| 342 | atomic_t numainfo_updating; | 304 | atomic_t numainfo_updating; |
| 343 | #endif | 305 | #endif |
| 306 | /* | ||
| 307 | * Protects soft_contributed transitions. | ||
| 308 | * See mem_cgroup_update_soft_limit | ||
| 309 | */ | ||
| 310 | spinlock_t soft_lock; | ||
| 311 | |||
| 312 | /* | ||
| 313 | * If true then this group has increased parents' children_in_excess | ||
| 314 | * when it got over the soft limit. | ||
| 315 | * When a group falls below the soft limit, parents' children_in_excess | ||
| 316 | * is decreased and soft_contributed changed to false. | ||
| 317 | */ | ||
| 318 | bool soft_contributed; | ||
| 319 | |||
| 320 | /* Number of children that are in soft limit excess */ | ||
| 321 | atomic_t children_in_excess; | ||
| 344 | 322 | ||
| 345 | struct mem_cgroup_per_node *nodeinfo[0]; | 323 | struct mem_cgroup_per_node *nodeinfo[0]; |
| 346 | /* WARNING: nodeinfo must be the last member here */ | 324 | /* WARNING: nodeinfo must be the last member here */ |
| @@ -444,7 +422,6 @@ static bool move_file(void) | |||
| 444 | * limit reclaim to prevent infinite loops, if they ever occur. | 422 | * limit reclaim to prevent infinite loops, if they ever occur. |
| 445 | */ | 423 | */ |
| 446 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
| 447 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
| 448 | 425 | ||
| 449 | enum charge_type { | 426 | enum charge_type { |
| 450 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| @@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
| 671 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 648 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
| 672 | } | 649 | } |
| 673 | 650 | ||
| 674 | static struct mem_cgroup_tree_per_zone * | ||
| 675 | soft_limit_tree_node_zone(int nid, int zid) | ||
| 676 | { | ||
| 677 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 678 | } | ||
| 679 | |||
| 680 | static struct mem_cgroup_tree_per_zone * | ||
| 681 | soft_limit_tree_from_page(struct page *page) | ||
| 682 | { | ||
| 683 | int nid = page_to_nid(page); | ||
| 684 | int zid = page_zonenum(page); | ||
| 685 | |||
| 686 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 687 | } | ||
| 688 | |||
| 689 | static void | ||
| 690 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
| 691 | struct mem_cgroup_per_zone *mz, | ||
| 692 | struct mem_cgroup_tree_per_zone *mctz, | ||
| 693 | unsigned long long new_usage_in_excess) | ||
| 694 | { | ||
| 695 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
| 696 | struct rb_node *parent = NULL; | ||
| 697 | struct mem_cgroup_per_zone *mz_node; | ||
| 698 | |||
| 699 | if (mz->on_tree) | ||
| 700 | return; | ||
| 701 | |||
| 702 | mz->usage_in_excess = new_usage_in_excess; | ||
| 703 | if (!mz->usage_in_excess) | ||
| 704 | return; | ||
| 705 | while (*p) { | ||
| 706 | parent = *p; | ||
| 707 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
| 708 | tree_node); | ||
| 709 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
| 710 | p = &(*p)->rb_left; | ||
| 711 | /* | ||
| 712 | * We can't avoid mem cgroups that are over their soft | ||
| 713 | * limit by the same amount | ||
| 714 | */ | ||
| 715 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
| 716 | p = &(*p)->rb_right; | ||
| 717 | } | ||
| 718 | rb_link_node(&mz->tree_node, parent, p); | ||
| 719 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
| 720 | mz->on_tree = true; | ||
| 721 | } | ||
| 722 | |||
| 723 | static void | ||
| 724 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
| 725 | struct mem_cgroup_per_zone *mz, | ||
| 726 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 727 | { | ||
| 728 | if (!mz->on_tree) | ||
| 729 | return; | ||
| 730 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
| 731 | mz->on_tree = false; | ||
| 732 | } | ||
| 733 | |||
| 734 | static void | ||
| 735 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
| 736 | struct mem_cgroup_per_zone *mz, | ||
| 737 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 738 | { | ||
| 739 | spin_lock(&mctz->lock); | ||
| 740 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 741 | spin_unlock(&mctz->lock); | ||
| 742 | } | ||
| 743 | |||
| 744 | |||
| 745 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
| 746 | { | ||
| 747 | unsigned long long excess; | ||
| 748 | struct mem_cgroup_per_zone *mz; | ||
| 749 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 750 | int nid = page_to_nid(page); | ||
| 751 | int zid = page_zonenum(page); | ||
| 752 | mctz = soft_limit_tree_from_page(page); | ||
| 753 | |||
| 754 | /* | ||
| 755 | * Necessary to update all ancestors when hierarchy is used. | ||
| 756 | * because their event counter is not touched. | ||
| 757 | */ | ||
| 758 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
| 759 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
| 760 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
| 761 | /* | ||
| 762 | * We have to update the tree if mz is on RB-tree or | ||
| 763 | * mem is over its softlimit. | ||
| 764 | */ | ||
| 765 | if (excess || mz->on_tree) { | ||
| 766 | spin_lock(&mctz->lock); | ||
| 767 | /* if on-tree, remove it */ | ||
| 768 | if (mz->on_tree) | ||
| 769 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 770 | /* | ||
| 771 | * Insert again. mz->usage_in_excess will be updated. | ||
| 772 | * If excess is 0, no tree ops. | ||
| 773 | */ | ||
| 774 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
| 775 | spin_unlock(&mctz->lock); | ||
| 776 | } | ||
| 777 | } | ||
| 778 | } | ||
| 779 | |||
| 780 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
| 781 | { | ||
| 782 | int node, zone; | ||
| 783 | struct mem_cgroup_per_zone *mz; | ||
| 784 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 785 | |||
| 786 | for_each_node(node) { | ||
| 787 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 788 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
| 789 | mctz = soft_limit_tree_node_zone(node, zone); | ||
| 790 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 791 | } | ||
| 792 | } | ||
| 793 | } | ||
| 794 | |||
| 795 | static struct mem_cgroup_per_zone * | ||
| 796 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 797 | { | ||
| 798 | struct rb_node *rightmost = NULL; | ||
| 799 | struct mem_cgroup_per_zone *mz; | ||
| 800 | |||
| 801 | retry: | ||
| 802 | mz = NULL; | ||
| 803 | rightmost = rb_last(&mctz->rb_root); | ||
| 804 | if (!rightmost) | ||
| 805 | goto done; /* Nothing to reclaim from */ | ||
| 806 | |||
| 807 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
| 808 | /* | ||
| 809 | * Remove the node now but someone else can add it back, | ||
| 810 | * we will to add it back at the end of reclaim to its correct | ||
| 811 | * position in the tree. | ||
| 812 | */ | ||
| 813 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
| 814 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
| 815 | !css_tryget(&mz->memcg->css)) | ||
| 816 | goto retry; | ||
| 817 | done: | ||
| 818 | return mz; | ||
| 819 | } | ||
| 820 | |||
| 821 | static struct mem_cgroup_per_zone * | ||
| 822 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 823 | { | ||
| 824 | struct mem_cgroup_per_zone *mz; | ||
| 825 | |||
| 826 | spin_lock(&mctz->lock); | ||
| 827 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 828 | spin_unlock(&mctz->lock); | ||
| 829 | return mz; | ||
| 830 | } | ||
| 831 | |||
| 832 | /* | 651 | /* |
| 833 | * Implementation Note: reading percpu statistics for memcg. | 652 | * Implementation Note: reading percpu statistics for memcg. |
| 834 | * | 653 | * |
| @@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
| 1003 | } | 822 | } |
| 1004 | 823 | ||
| 1005 | /* | 824 | /* |
| 825 | * Called from rate-limited memcg_check_events when enough | ||
| 826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
| 827 | * that all the parents up the hierarchy will be notified that this group | ||
| 828 | * is in excess or that it is not in excess anymore. memcg->soft_contributed | ||
| 829 | * makes the transition a single action whenever the state flips from one to | ||
| 830 | * the other. | ||
| 831 | */ | ||
| 832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
| 833 | { | ||
| 834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
| 835 | struct mem_cgroup *parent = memcg; | ||
| 836 | int delta = 0; | ||
| 837 | |||
| 838 | spin_lock(&memcg->soft_lock); | ||
| 839 | if (excess) { | ||
| 840 | if (!memcg->soft_contributed) { | ||
| 841 | delta = 1; | ||
| 842 | memcg->soft_contributed = true; | ||
| 843 | } | ||
| 844 | } else { | ||
| 845 | if (memcg->soft_contributed) { | ||
| 846 | delta = -1; | ||
| 847 | memcg->soft_contributed = false; | ||
| 848 | } | ||
| 849 | } | ||
| 850 | |||
| 851 | /* | ||
| 852 | * Necessary to update all ancestors when hierarchy is used | ||
| 853 | * because their event counter is not touched. | ||
| 854 | * We track children even outside the hierarchy for the root | ||
| 855 | * cgroup because tree walk starting at root should visit | ||
| 856 | * all cgroups and we want to prevent from pointless tree | ||
| 857 | * walk if no child is over its soft limit. | ||
| 858 | */ | ||
| 859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
| 860 | atomic_add(delta, &parent->children_in_excess); | ||
| 861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
| 862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
| 863 | spin_unlock(&memcg->soft_lock); | ||
| 864 | } | ||
| 865 | |||
| 866 | /* | ||
| 1006 | * Check events in order. | 867 | * Check events in order. |
| 1007 | * | 868 | * |
| 1008 | */ | 869 | */ |
| @@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
| 1025 | 886 | ||
| 1026 | mem_cgroup_threshold(memcg); | 887 | mem_cgroup_threshold(memcg); |
| 1027 | if (unlikely(do_softlimit)) | 888 | if (unlikely(do_softlimit)) |
| 1028 | mem_cgroup_update_tree(memcg, page); | 889 | mem_cgroup_update_soft_limit(memcg); |
| 1029 | #if MAX_NUMNODES > 1 | 890 | #if MAX_NUMNODES > 1 |
| 1030 | if (unlikely(do_numainfo)) | 891 | if (unlikely(do_numainfo)) |
| 1031 | atomic_inc(&memcg->numainfo_events); | 892 | atomic_inc(&memcg->numainfo_events); |
| @@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
| 1068 | return memcg; | 929 | return memcg; |
| 1069 | } | 930 | } |
| 1070 | 931 | ||
| 932 | static enum mem_cgroup_filter_t | ||
| 933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
| 934 | mem_cgroup_iter_filter cond) | ||
| 935 | { | ||
| 936 | if (!cond) | ||
| 937 | return VISIT; | ||
| 938 | return cond(memcg, root); | ||
| 939 | } | ||
| 940 | |||
| 1071 | /* | 941 | /* |
| 1072 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
| 1073 | * ref. count) or NULL if the whole root's subtree has been visited. | 943 | * ref. count) or NULL if the whole root's subtree has been visited. |
| @@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
| 1075 | * helper function to be used by mem_cgroup_iter | 945 | * helper function to be used by mem_cgroup_iter |
| 1076 | */ | 946 | */ |
| 1077 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
| 1078 | struct mem_cgroup *last_visited) | 948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) |
| 1079 | { | 949 | { |
| 1080 | struct cgroup_subsys_state *prev_css, *next_css; | 950 | struct cgroup_subsys_state *prev_css, *next_css; |
| 1081 | 951 | ||
| @@ -1093,11 +963,31 @@ skip_node: | |||
| 1093 | if (next_css) { | 963 | if (next_css) { |
| 1094 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
| 1095 | 965 | ||
| 1096 | if (css_tryget(&mem->css)) | 966 | switch (mem_cgroup_filter(mem, root, cond)) { |
| 1097 | return mem; | 967 | case SKIP: |
| 1098 | else { | ||
| 1099 | prev_css = next_css; | 968 | prev_css = next_css; |
| 1100 | goto skip_node; | 969 | goto skip_node; |
| 970 | case SKIP_TREE: | ||
| 971 | if (mem == root) | ||
| 972 | return NULL; | ||
| 973 | /* | ||
| 974 | * css_rightmost_descendant is not an optimal way to | ||
| 975 | * skip through a subtree (especially for imbalanced | ||
| 976 | * trees leaning to right) but that's what we have right | ||
| 977 | * now. More effective solution would be traversing | ||
| 978 | * right-up for first non-NULL without calling | ||
| 979 | * css_next_descendant_pre afterwards. | ||
| 980 | */ | ||
| 981 | prev_css = css_rightmost_descendant(next_css); | ||
| 982 | goto skip_node; | ||
| 983 | case VISIT: | ||
| 984 | if (css_tryget(&mem->css)) | ||
| 985 | return mem; | ||
| 986 | else { | ||
| 987 | prev_css = next_css; | ||
| 988 | goto skip_node; | ||
| 989 | } | ||
| 990 | break; | ||
| 1101 | } | 991 | } |
| 1102 | } | 992 | } |
| 1103 | 993 | ||
| @@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
| 1161 | * @root: hierarchy root | 1051 | * @root: hierarchy root |
| 1162 | * @prev: previously returned memcg, NULL on first invocation | 1052 | * @prev: previously returned memcg, NULL on first invocation |
| 1163 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
| 1054 | * @cond: filter for visited nodes, NULL for no filter | ||
| 1164 | * | 1055 | * |
| 1165 | * Returns references to children of the hierarchy below @root, or | 1056 | * Returns references to children of the hierarchy below @root, or |
| 1166 | * @root itself, or %NULL after a full round-trip. | 1057 | * @root itself, or %NULL after a full round-trip. |
| @@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
| 1173 | * divide up the memcgs in the hierarchy among all concurrent | 1064 | * divide up the memcgs in the hierarchy among all concurrent |
| 1174 | * reclaimers operating on the same zone and priority. | 1065 | * reclaimers operating on the same zone and priority. |
| 1175 | */ | 1066 | */ |
| 1176 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | 1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, |
| 1177 | struct mem_cgroup *prev, | 1068 | struct mem_cgroup *prev, |
| 1178 | struct mem_cgroup_reclaim_cookie *reclaim) | 1069 | struct mem_cgroup_reclaim_cookie *reclaim, |
| 1070 | mem_cgroup_iter_filter cond) | ||
| 1179 | { | 1071 | { |
| 1180 | struct mem_cgroup *memcg = NULL; | 1072 | struct mem_cgroup *memcg = NULL; |
| 1181 | struct mem_cgroup *last_visited = NULL; | 1073 | struct mem_cgroup *last_visited = NULL; |
| 1182 | 1074 | ||
| 1183 | if (mem_cgroup_disabled()) | 1075 | if (mem_cgroup_disabled()) { |
| 1184 | return NULL; | 1076 | /* first call must return non-NULL, second return NULL */ |
| 1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
| 1078 | } | ||
| 1185 | 1079 | ||
| 1186 | if (!root) | 1080 | if (!root) |
| 1187 | root = root_mem_cgroup; | 1081 | root = root_mem_cgroup; |
| @@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1192 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
| 1193 | if (prev) | 1087 | if (prev) |
| 1194 | goto out_css_put; | 1088 | goto out_css_put; |
| 1195 | return root; | 1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) |
| 1090 | return root; | ||
| 1091 | return NULL; | ||
| 1196 | } | 1092 | } |
| 1197 | 1093 | ||
| 1198 | rcu_read_lock(); | 1094 | rcu_read_lock(); |
| @@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1215 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
| 1216 | } | 1112 | } |
| 1217 | 1113 | ||
| 1218 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); |
| 1219 | 1115 | ||
| 1220 | if (reclaim) { | 1116 | if (reclaim) { |
| 1221 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
| @@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1226 | reclaim->generation = iter->generation; | 1122 | reclaim->generation = iter->generation; |
| 1227 | } | 1123 | } |
| 1228 | 1124 | ||
| 1229 | if (prev && !memcg) | 1125 | /* |
| 1126 | * We have finished the whole tree walk or no group has been | ||
| 1127 | * visited because filter told us to skip the root node. | ||
| 1128 | */ | ||
| 1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
| 1230 | goto out_unlock; | 1130 | goto out_unlock; |
| 1231 | } | 1131 | } |
| 1232 | out_unlock: | 1132 | out_unlock: |
| @@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
| 1867 | return total; | 1767 | return total; |
| 1868 | } | 1768 | } |
| 1869 | 1769 | ||
| 1770 | #if MAX_NUMNODES > 1 | ||
| 1870 | /** | 1771 | /** |
| 1871 | * test_mem_cgroup_node_reclaimable | 1772 | * test_mem_cgroup_node_reclaimable |
| 1872 | * @memcg: the target memcg | 1773 | * @memcg: the target memcg |
| @@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
| 1889 | return false; | 1790 | return false; |
| 1890 | 1791 | ||
| 1891 | } | 1792 | } |
| 1892 | #if MAX_NUMNODES > 1 | ||
| 1893 | 1793 | ||
| 1894 | /* | 1794 | /* |
| 1895 | * Always updating the nodemask is not very good - even if we have an empty | 1795 | * Always updating the nodemask is not very good - even if we have an empty |
| @@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
| 1957 | return node; | 1857 | return node; |
| 1958 | } | 1858 | } |
| 1959 | 1859 | ||
| 1960 | /* | ||
| 1961 | * Check all nodes whether it contains reclaimable pages or not. | ||
| 1962 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
| 1963 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
| 1964 | * enough new information. We need to do double check. | ||
| 1965 | */ | ||
| 1966 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
| 1967 | { | ||
| 1968 | int nid; | ||
| 1969 | |||
| 1970 | /* | ||
| 1971 | * quick check...making use of scan_node. | ||
| 1972 | * We can skip unused nodes. | ||
| 1973 | */ | ||
| 1974 | if (!nodes_empty(memcg->scan_nodes)) { | ||
| 1975 | for (nid = first_node(memcg->scan_nodes); | ||
| 1976 | nid < MAX_NUMNODES; | ||
| 1977 | nid = next_node(nid, memcg->scan_nodes)) { | ||
| 1978 | |||
| 1979 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
| 1980 | return true; | ||
| 1981 | } | ||
| 1982 | } | ||
| 1983 | /* | ||
| 1984 | * Check rest of nodes. | ||
| 1985 | */ | ||
| 1986 | for_each_node_state(nid, N_MEMORY) { | ||
| 1987 | if (node_isset(nid, memcg->scan_nodes)) | ||
| 1988 | continue; | ||
| 1989 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
| 1990 | return true; | ||
| 1991 | } | ||
| 1992 | return false; | ||
| 1993 | } | ||
| 1994 | |||
| 1995 | #else | 1860 | #else |
| 1996 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
| 1997 | { | 1862 | { |
| 1998 | return 0; | 1863 | return 0; |
| 1999 | } | 1864 | } |
| 2000 | 1865 | ||
| 2001 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
| 2002 | { | ||
| 2003 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | ||
| 2004 | } | ||
| 2005 | #endif | 1866 | #endif |
| 2006 | 1867 | ||
| 2007 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1868 | /* |
| 2008 | struct zone *zone, | 1869 | * A group is eligible for the soft limit reclaim under the given root |
| 2009 | gfp_t gfp_mask, | 1870 | * hierarchy if |
| 2010 | unsigned long *total_scanned) | 1871 | * a) it is over its soft limit |
| 2011 | { | 1872 | * b) any parent up the hierarchy is over its soft limit |
| 2012 | struct mem_cgroup *victim = NULL; | 1873 | * |
| 2013 | int total = 0; | 1874 | * If the given group doesn't have any children over the limit then it |
| 2014 | int loop = 0; | 1875 | * doesn't make any sense to iterate its subtree. |
| 2015 | unsigned long excess; | 1876 | */ |
| 2016 | unsigned long nr_scanned; | 1877 | enum mem_cgroup_filter_t |
| 2017 | struct mem_cgroup_reclaim_cookie reclaim = { | 1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
| 2018 | .zone = zone, | 1879 | struct mem_cgroup *root) |
| 2019 | .priority = 0, | 1880 | { |
| 2020 | }; | 1881 | struct mem_cgroup *parent; |
| 2021 | 1882 | ||
| 2022 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1883 | if (!memcg) |
| 2023 | 1884 | memcg = root_mem_cgroup; | |
| 2024 | while (1) { | 1885 | parent = memcg; |
| 2025 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1886 | |
| 2026 | if (!victim) { | 1887 | if (res_counter_soft_limit_excess(&memcg->res)) |
| 2027 | loop++; | 1888 | return VISIT; |
| 2028 | if (loop >= 2) { | 1889 | |
| 2029 | /* | 1890 | /* |
| 2030 | * If we have not been able to reclaim | 1891 | * If any parent up to the root in the hierarchy is over its soft limit |
| 2031 | * anything, it might because there are | 1892 | * then we have to obey and reclaim from this group as well. |
| 2032 | * no reclaimable pages under this hierarchy | 1893 | */ |
| 2033 | */ | 1894 | while ((parent = parent_mem_cgroup(parent))) { |
| 2034 | if (!total) | 1895 | if (res_counter_soft_limit_excess(&parent->res)) |
| 2035 | break; | 1896 | return VISIT; |
| 2036 | /* | 1897 | if (parent == root) |
| 2037 | * We want to do more targeted reclaim. | ||
| 2038 | * excess >> 2 is not to excessive so as to | ||
| 2039 | * reclaim too much, nor too less that we keep | ||
| 2040 | * coming back to reclaim from this cgroup | ||
| 2041 | */ | ||
| 2042 | if (total >= (excess >> 2) || | ||
| 2043 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
| 2044 | break; | ||
| 2045 | } | ||
| 2046 | continue; | ||
| 2047 | } | ||
| 2048 | if (!mem_cgroup_reclaimable(victim, false)) | ||
| 2049 | continue; | ||
| 2050 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
| 2051 | zone, &nr_scanned); | ||
| 2052 | *total_scanned += nr_scanned; | ||
| 2053 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
| 2054 | break; | 1898 | break; |
| 2055 | } | 1899 | } |
| 2056 | mem_cgroup_iter_break(root_memcg, victim); | 1900 | |
| 2057 | return total; | 1901 | if (!atomic_read(&memcg->children_in_excess)) |
| 1902 | return SKIP_TREE; | ||
| 1903 | return SKIP; | ||
| 2058 | } | 1904 | } |
| 2059 | 1905 | ||
| 1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
| 1907 | |||
| 2060 | /* | 1908 | /* |
| 2061 | * Check OOM-Killer is already running under our hierarchy. | 1909 | * Check OOM-Killer is already running under our hierarchy. |
| 2062 | * If someone is running, return false. | 1910 | * If someone is running, return false. |
| 2063 | * Has to be called with memcg_oom_lock | ||
| 2064 | */ | 1911 | */ |
| 2065 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | 1912 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
| 2066 | { | 1913 | { |
| 2067 | struct mem_cgroup *iter, *failed = NULL; | 1914 | struct mem_cgroup *iter, *failed = NULL; |
| 2068 | 1915 | ||
| 1916 | spin_lock(&memcg_oom_lock); | ||
| 1917 | |||
| 2069 | for_each_mem_cgroup_tree(iter, memcg) { | 1918 | for_each_mem_cgroup_tree(iter, memcg) { |
| 2070 | if (iter->oom_lock) { | 1919 | if (iter->oom_lock) { |
| 2071 | /* | 1920 | /* |
| @@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | |||
| 2079 | iter->oom_lock = true; | 1928 | iter->oom_lock = true; |
| 2080 | } | 1929 | } |
| 2081 | 1930 | ||
| 2082 | if (!failed) | 1931 | if (failed) { |
| 2083 | return true; | 1932 | /* |
| 2084 | 1933 | * OK, we failed to lock the whole subtree so we have | |
| 2085 | /* | 1934 | * to clean up what we set up to the failing subtree |
| 2086 | * OK, we failed to lock the whole subtree so we have to clean up | 1935 | */ |
| 2087 | * what we set up to the failing subtree | 1936 | for_each_mem_cgroup_tree(iter, memcg) { |
| 2088 | */ | 1937 | if (iter == failed) { |
| 2089 | for_each_mem_cgroup_tree(iter, memcg) { | 1938 | mem_cgroup_iter_break(memcg, iter); |
| 2090 | if (iter == failed) { | 1939 | break; |
| 2091 | mem_cgroup_iter_break(memcg, iter); | 1940 | } |
| 2092 | break; | 1941 | iter->oom_lock = false; |
| 2093 | } | 1942 | } |
| 2094 | iter->oom_lock = false; | ||
| 2095 | } | 1943 | } |
| 2096 | return false; | 1944 | |
| 1945 | spin_unlock(&memcg_oom_lock); | ||
| 1946 | |||
| 1947 | return !failed; | ||
| 2097 | } | 1948 | } |
| 2098 | 1949 | ||
| 2099 | /* | 1950 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
| 2100 | * Has to be called with memcg_oom_lock | ||
| 2101 | */ | ||
| 2102 | static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | ||
| 2103 | { | 1951 | { |
| 2104 | struct mem_cgroup *iter; | 1952 | struct mem_cgroup *iter; |
| 2105 | 1953 | ||
| 1954 | spin_lock(&memcg_oom_lock); | ||
| 2106 | for_each_mem_cgroup_tree(iter, memcg) | 1955 | for_each_mem_cgroup_tree(iter, memcg) |
| 2107 | iter->oom_lock = false; | 1956 | iter->oom_lock = false; |
| 2108 | return 0; | 1957 | spin_unlock(&memcg_oom_lock); |
| 2109 | } | 1958 | } |
| 2110 | 1959 | ||
| 2111 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | 1960 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
| @@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | |||
| 2129 | atomic_add_unless(&iter->under_oom, -1, 0); | 1978 | atomic_add_unless(&iter->under_oom, -1, 0); |
| 2130 | } | 1979 | } |
| 2131 | 1980 | ||
| 2132 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
| 2133 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1981 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
| 2134 | 1982 | ||
| 2135 | struct oom_wait_info { | 1983 | struct oom_wait_info { |
| @@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
| 2159 | 2007 | ||
| 2160 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) | 2008 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
| 2161 | { | 2009 | { |
| 2010 | atomic_inc(&memcg->oom_wakeups); | ||
| 2162 | /* for filtering, pass "memcg" as argument. */ | 2011 | /* for filtering, pass "memcg" as argument. */ |
| 2163 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | 2012 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
| 2164 | } | 2013 | } |
| @@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
| 2170 | } | 2019 | } |
| 2171 | 2020 | ||
| 2172 | /* | 2021 | /* |
| 2173 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 2022 | * try to call OOM killer |
| 2174 | */ | 2023 | */ |
| 2175 | static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, | 2024 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
| 2176 | int order) | ||
| 2177 | { | 2025 | { |
| 2178 | struct oom_wait_info owait; | 2026 | bool locked; |
| 2179 | bool locked, need_to_kill; | 2027 | int wakeups; |
| 2180 | 2028 | ||
| 2181 | owait.memcg = memcg; | 2029 | if (!current->memcg_oom.may_oom) |
| 2182 | owait.wait.flags = 0; | 2030 | return; |
| 2183 | owait.wait.func = memcg_oom_wake_function; | 2031 | |
| 2184 | owait.wait.private = current; | 2032 | current->memcg_oom.in_memcg_oom = 1; |
| 2185 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
| 2186 | need_to_kill = true; | ||
| 2187 | mem_cgroup_mark_under_oom(memcg); | ||
| 2188 | 2033 | ||
| 2189 | /* At first, try to OOM lock hierarchy under memcg.*/ | ||
| 2190 | spin_lock(&memcg_oom_lock); | ||
| 2191 | locked = mem_cgroup_oom_lock(memcg); | ||
| 2192 | /* | 2034 | /* |
| 2193 | * Even if signal_pending(), we can't quit charge() loop without | 2035 | * As with any blocking lock, a contender needs to start |
| 2194 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 2036 | * listening for wakeups before attempting the trylock, |
| 2195 | * under OOM is always welcomed, use TASK_KILLABLE here. | 2037 | * otherwise it can miss the wakeup from the unlock and sleep |
| 2038 | * indefinitely. This is just open-coded because our locking | ||
| 2039 | * is so particular to memcg hierarchies. | ||
| 2196 | */ | 2040 | */ |
| 2197 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2041 | wakeups = atomic_read(&memcg->oom_wakeups); |
| 2198 | if (!locked || memcg->oom_kill_disable) | 2042 | mem_cgroup_mark_under_oom(memcg); |
| 2199 | need_to_kill = false; | 2043 | |
| 2044 | locked = mem_cgroup_oom_trylock(memcg); | ||
| 2045 | |||
| 2200 | if (locked) | 2046 | if (locked) |
| 2201 | mem_cgroup_oom_notify(memcg); | 2047 | mem_cgroup_oom_notify(memcg); |
| 2202 | spin_unlock(&memcg_oom_lock); | ||
| 2203 | 2048 | ||
| 2204 | if (need_to_kill) { | 2049 | if (locked && !memcg->oom_kill_disable) { |
| 2205 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2050 | mem_cgroup_unmark_under_oom(memcg); |
| 2206 | mem_cgroup_out_of_memory(memcg, mask, order); | 2051 | mem_cgroup_out_of_memory(memcg, mask, order); |
| 2052 | mem_cgroup_oom_unlock(memcg); | ||
| 2053 | /* | ||
| 2054 | * There is no guarantee that an OOM-lock contender | ||
| 2055 | * sees the wakeups triggered by the OOM kill | ||
| 2056 | * uncharges. Wake any sleepers explicitely. | ||
| 2057 | */ | ||
| 2058 | memcg_oom_recover(memcg); | ||
| 2207 | } else { | 2059 | } else { |
| 2208 | schedule(); | 2060 | /* |
| 2209 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2061 | * A system call can just return -ENOMEM, but if this |
| 2062 | * is a page fault and somebody else is handling the | ||
| 2063 | * OOM already, we need to sleep on the OOM waitqueue | ||
| 2064 | * for this memcg until the situation is resolved. | ||
| 2065 | * Which can take some time because it might be | ||
| 2066 | * handled by a userspace task. | ||
| 2067 | * | ||
| 2068 | * However, this is the charge context, which means | ||
| 2069 | * that we may sit on a large call stack and hold | ||
| 2070 | * various filesystem locks, the mmap_sem etc. and we | ||
| 2071 | * don't want the OOM handler to deadlock on them | ||
| 2072 | * while we sit here and wait. Store the current OOM | ||
| 2073 | * context in the task_struct, then return -ENOMEM. | ||
| 2074 | * At the end of the page fault handler, with the | ||
| 2075 | * stack unwound, pagefault_out_of_memory() will check | ||
| 2076 | * back with us by calling | ||
| 2077 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
| 2078 | * task to sleep. | ||
| 2079 | */ | ||
| 2080 | current->memcg_oom.oom_locked = locked; | ||
| 2081 | current->memcg_oom.wakeups = wakeups; | ||
| 2082 | css_get(&memcg->css); | ||
| 2083 | current->memcg_oom.wait_on_memcg = memcg; | ||
| 2210 | } | 2084 | } |
| 2211 | spin_lock(&memcg_oom_lock); | 2085 | } |
| 2212 | if (locked) | ||
| 2213 | mem_cgroup_oom_unlock(memcg); | ||
| 2214 | memcg_wakeup_oom(memcg); | ||
| 2215 | spin_unlock(&memcg_oom_lock); | ||
| 2216 | 2086 | ||
| 2217 | mem_cgroup_unmark_under_oom(memcg); | 2087 | /** |
| 2088 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | ||
| 2089 | * | ||
| 2090 | * This has to be called at the end of a page fault if the memcg | ||
| 2091 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | ||
| 2092 | * | ||
| 2093 | * Memcg supports userspace OOM handling, so failed allocations must | ||
| 2094 | * sleep on a waitqueue until the userspace task resolves the | ||
| 2095 | * situation. Sleeping directly in the charge context with all kinds | ||
| 2096 | * of locks held is not a good idea, instead we remember an OOM state | ||
| 2097 | * in the task and mem_cgroup_oom_synchronize() has to be called at | ||
| 2098 | * the end of the page fault to put the task to sleep and clean up the | ||
| 2099 | * OOM state. | ||
| 2100 | * | ||
| 2101 | * Returns %true if an ongoing memcg OOM situation was detected and | ||
| 2102 | * finalized, %false otherwise. | ||
| 2103 | */ | ||
| 2104 | bool mem_cgroup_oom_synchronize(void) | ||
| 2105 | { | ||
| 2106 | struct oom_wait_info owait; | ||
| 2107 | struct mem_cgroup *memcg; | ||
| 2218 | 2108 | ||
| 2219 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2109 | /* OOM is global, do not handle */ |
| 2110 | if (!current->memcg_oom.in_memcg_oom) | ||
| 2220 | return false; | 2111 | return false; |
| 2221 | /* Give chance to dying process */ | 2112 | |
| 2222 | schedule_timeout_uninterruptible(1); | 2113 | /* |
| 2114 | * We invoked the OOM killer but there is a chance that a kill | ||
| 2115 | * did not free up any charges. Everybody else might already | ||
| 2116 | * be sleeping, so restart the fault and keep the rampage | ||
| 2117 | * going until some charges are released. | ||
| 2118 | */ | ||
| 2119 | memcg = current->memcg_oom.wait_on_memcg; | ||
| 2120 | if (!memcg) | ||
| 2121 | goto out; | ||
| 2122 | |||
| 2123 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
| 2124 | goto out_memcg; | ||
| 2125 | |||
| 2126 | owait.memcg = memcg; | ||
| 2127 | owait.wait.flags = 0; | ||
| 2128 | owait.wait.func = memcg_oom_wake_function; | ||
| 2129 | owait.wait.private = current; | ||
| 2130 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
| 2131 | |||
| 2132 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | ||
| 2133 | /* Only sleep if we didn't miss any wakeups since OOM */ | ||
| 2134 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | ||
| 2135 | schedule(); | ||
| 2136 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
| 2137 | out_memcg: | ||
| 2138 | mem_cgroup_unmark_under_oom(memcg); | ||
| 2139 | if (current->memcg_oom.oom_locked) { | ||
| 2140 | mem_cgroup_oom_unlock(memcg); | ||
| 2141 | /* | ||
| 2142 | * There is no guarantee that an OOM-lock contender | ||
| 2143 | * sees the wakeups triggered by the OOM kill | ||
| 2144 | * uncharges. Wake any sleepers explicitly. | ||
| 2145 | */ | ||
| 2146 | memcg_oom_recover(memcg); | ||
| 2147 | } | ||
| 2148 | css_put(&memcg->css); | ||
| 2149 | current->memcg_oom.wait_on_memcg = NULL; | ||
| 2150 | out: | ||
| 2151 | current->memcg_oom.in_memcg_oom = 0; | ||
| 2223 | return true; | 2152 | return true; |
| 2224 | } | 2153 | } |
| 2225 | 2154 | ||
| @@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
| 2288 | } | 2217 | } |
| 2289 | 2218 | ||
| 2290 | void mem_cgroup_update_page_stat(struct page *page, | 2219 | void mem_cgroup_update_page_stat(struct page *page, |
| 2291 | enum mem_cgroup_page_stat_item idx, int val) | 2220 | enum mem_cgroup_stat_index idx, int val) |
| 2292 | { | 2221 | { |
| 2293 | struct mem_cgroup *memcg; | 2222 | struct mem_cgroup *memcg; |
| 2294 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2223 | struct page_cgroup *pc = lookup_page_cgroup(page); |
| @@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
| 2297 | if (mem_cgroup_disabled()) | 2226 | if (mem_cgroup_disabled()) |
| 2298 | return; | 2227 | return; |
| 2299 | 2228 | ||
| 2229 | VM_BUG_ON(!rcu_read_lock_held()); | ||
| 2300 | memcg = pc->mem_cgroup; | 2230 | memcg = pc->mem_cgroup; |
| 2301 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2231 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
| 2302 | return; | 2232 | return; |
| 2303 | 2233 | ||
| 2304 | switch (idx) { | ||
| 2305 | case MEMCG_NR_FILE_MAPPED: | ||
| 2306 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
| 2307 | break; | ||
| 2308 | default: | ||
| 2309 | BUG(); | ||
| 2310 | } | ||
| 2311 | |||
| 2312 | this_cpu_add(memcg->stat->count[idx], val); | 2234 | this_cpu_add(memcg->stat->count[idx], val); |
| 2313 | } | 2235 | } |
| 2314 | 2236 | ||
| @@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
| 2450 | flush_work(&stock->work); | 2372 | flush_work(&stock->work); |
| 2451 | } | 2373 | } |
| 2452 | out: | 2374 | out: |
| 2453 | put_online_cpus(); | 2375 | put_online_cpus(); |
| 2454 | } | 2376 | } |
| 2455 | 2377 | ||
| 2456 | /* | 2378 | /* |
| @@ -2532,12 +2454,11 @@ enum { | |||
| 2532 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2454 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
| 2533 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | 2455 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ |
| 2534 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ | 2456 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ |
| 2535 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | ||
| 2536 | }; | 2457 | }; |
| 2537 | 2458 | ||
| 2538 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2459 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
| 2539 | unsigned int nr_pages, unsigned int min_pages, | 2460 | unsigned int nr_pages, unsigned int min_pages, |
| 2540 | bool oom_check) | 2461 | bool invoke_oom) |
| 2541 | { | 2462 | { |
| 2542 | unsigned long csize = nr_pages * PAGE_SIZE; | 2463 | unsigned long csize = nr_pages * PAGE_SIZE; |
| 2543 | struct mem_cgroup *mem_over_limit; | 2464 | struct mem_cgroup *mem_over_limit; |
| @@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 2594 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 2515 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
| 2595 | return CHARGE_RETRY; | 2516 | return CHARGE_RETRY; |
| 2596 | 2517 | ||
| 2597 | /* If we don't need to call oom-killer at el, return immediately */ | 2518 | if (invoke_oom) |
| 2598 | if (!oom_check) | 2519 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); |
| 2599 | return CHARGE_NOMEM; | ||
| 2600 | /* check OOM */ | ||
| 2601 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) | ||
| 2602 | return CHARGE_OOM_DIE; | ||
| 2603 | 2520 | ||
| 2604 | return CHARGE_RETRY; | 2521 | return CHARGE_NOMEM; |
| 2605 | } | 2522 | } |
| 2606 | 2523 | ||
| 2607 | /* | 2524 | /* |
| @@ -2704,7 +2621,7 @@ again: | |||
| 2704 | } | 2621 | } |
| 2705 | 2622 | ||
| 2706 | do { | 2623 | do { |
| 2707 | bool oom_check; | 2624 | bool invoke_oom = oom && !nr_oom_retries; |
| 2708 | 2625 | ||
| 2709 | /* If killed, bypass charge */ | 2626 | /* If killed, bypass charge */ |
| 2710 | if (fatal_signal_pending(current)) { | 2627 | if (fatal_signal_pending(current)) { |
| @@ -2712,14 +2629,8 @@ again: | |||
| 2712 | goto bypass; | 2629 | goto bypass; |
| 2713 | } | 2630 | } |
| 2714 | 2631 | ||
| 2715 | oom_check = false; | 2632 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, |
| 2716 | if (oom && !nr_oom_retries) { | 2633 | nr_pages, invoke_oom); |
| 2717 | oom_check = true; | ||
| 2718 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
| 2719 | } | ||
| 2720 | |||
| 2721 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, | ||
| 2722 | oom_check); | ||
| 2723 | switch (ret) { | 2634 | switch (ret) { |
| 2724 | case CHARGE_OK: | 2635 | case CHARGE_OK: |
| 2725 | break; | 2636 | break; |
| @@ -2732,16 +2643,12 @@ again: | |||
| 2732 | css_put(&memcg->css); | 2643 | css_put(&memcg->css); |
| 2733 | goto nomem; | 2644 | goto nomem; |
| 2734 | case CHARGE_NOMEM: /* OOM routine works */ | 2645 | case CHARGE_NOMEM: /* OOM routine works */ |
| 2735 | if (!oom) { | 2646 | if (!oom || invoke_oom) { |
| 2736 | css_put(&memcg->css); | 2647 | css_put(&memcg->css); |
| 2737 | goto nomem; | 2648 | goto nomem; |
| 2738 | } | 2649 | } |
| 2739 | /* If oom, we never return -ENOMEM */ | ||
| 2740 | nr_oom_retries--; | 2650 | nr_oom_retries--; |
| 2741 | break; | 2651 | break; |
| 2742 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | ||
| 2743 | css_put(&memcg->css); | ||
| 2744 | goto bypass; | ||
| 2745 | } | 2652 | } |
| 2746 | } while (ret != CHARGE_OK); | 2653 | } while (ret != CHARGE_OK); |
| 2747 | 2654 | ||
| @@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2882 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | 2789 | * is accessed after testing USED bit. To make pc->mem_cgroup visible |
| 2883 | * before USED bit, we need memory barrier here. | 2790 | * before USED bit, we need memory barrier here. |
| 2884 | * See mem_cgroup_add_lru_list(), etc. | 2791 | * See mem_cgroup_add_lru_list(), etc. |
| 2885 | */ | 2792 | */ |
| 2886 | smp_wmb(); | 2793 | smp_wmb(); |
| 2887 | SetPageCgroupUsed(pc); | 2794 | SetPageCgroupUsed(pc); |
| 2888 | 2795 | ||
| @@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2905 | unlock_page_cgroup(pc); | 2812 | unlock_page_cgroup(pc); |
| 2906 | 2813 | ||
| 2907 | /* | 2814 | /* |
| 2908 | * "charge_statistics" updated event counter. Then, check it. | 2815 | * "charge_statistics" updated event counter. |
| 2909 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
| 2910 | * if they exceeds softlimit. | ||
| 2911 | */ | 2816 | */ |
| 2912 | memcg_check_events(memcg, page); | 2817 | memcg_check_events(memcg, page); |
| 2913 | } | 2818 | } |
| @@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
| 3626 | * the page allocator. Therefore, the following sequence when backed by | 3531 | * the page allocator. Therefore, the following sequence when backed by |
| 3627 | * the SLUB allocator: | 3532 | * the SLUB allocator: |
| 3628 | * | 3533 | * |
| 3629 | * memcg_stop_kmem_account(); | 3534 | * memcg_stop_kmem_account(); |
| 3630 | * kmalloc(<large_number>) | 3535 | * kmalloc(<large_number>) |
| 3631 | * memcg_resume_kmem_account(); | 3536 | * memcg_resume_kmem_account(); |
| 3632 | * | 3537 | * |
| 3633 | * would effectively ignore the fact that we should skip accounting, | 3538 | * would effectively ignore the fact that we should skip accounting, |
| 3634 | * since it will drive us directly to this function without passing | 3539 | * since it will drive us directly to this function without passing |
| @@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
| 3750 | } | 3655 | } |
| 3751 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3656 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| 3752 | 3657 | ||
| 3658 | static inline | ||
| 3659 | void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | ||
| 3660 | struct mem_cgroup *to, | ||
| 3661 | unsigned int nr_pages, | ||
| 3662 | enum mem_cgroup_stat_index idx) | ||
| 3663 | { | ||
| 3664 | /* Update stat data for mem_cgroup */ | ||
| 3665 | preempt_disable(); | ||
| 3666 | WARN_ON_ONCE(from->stat->count[idx] < nr_pages); | ||
| 3667 | __this_cpu_add(from->stat->count[idx], -nr_pages); | ||
| 3668 | __this_cpu_add(to->stat->count[idx], nr_pages); | ||
| 3669 | preempt_enable(); | ||
| 3670 | } | ||
| 3671 | |||
| 3753 | /** | 3672 | /** |
| 3754 | * mem_cgroup_move_account - move account of the page | 3673 | * mem_cgroup_move_account - move account of the page |
| 3755 | * @page: the page | 3674 | * @page: the page |
| @@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page, | |||
| 3795 | 3714 | ||
| 3796 | move_lock_mem_cgroup(from, &flags); | 3715 | move_lock_mem_cgroup(from, &flags); |
| 3797 | 3716 | ||
| 3798 | if (!anon && page_mapped(page)) { | 3717 | if (!anon && page_mapped(page)) |
| 3799 | /* Update mapped_file data for mem_cgroup */ | 3718 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
| 3800 | preempt_disable(); | 3719 | MEM_CGROUP_STAT_FILE_MAPPED); |
| 3801 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3720 | |
| 3802 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3721 | if (PageWriteback(page)) |
| 3803 | preempt_enable(); | 3722 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
| 3804 | } | 3723 | MEM_CGROUP_STAT_WRITEBACK); |
| 3724 | |||
| 3805 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); | 3725 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
| 3806 | 3726 | ||
| 3807 | /* caller should have done css_get */ | 3727 | /* caller should have done css_get */ |
| @@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 4657 | MEM_CGROUP_RECLAIM_SHRINK); | 4577 | MEM_CGROUP_RECLAIM_SHRINK); |
| 4658 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4578 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
| 4659 | /* Usage is reduced ? */ | 4579 | /* Usage is reduced ? */ |
| 4660 | if (curusage >= oldusage) | 4580 | if (curusage >= oldusage) |
| 4661 | retry_count--; | 4581 | retry_count--; |
| 4662 | else | 4582 | else |
| 4663 | oldusage = curusage; | 4583 | oldusage = curusage; |
| @@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 4678 | int enlarge = 0; | 4598 | int enlarge = 0; |
| 4679 | 4599 | ||
| 4680 | /* see mem_cgroup_resize_res_limit */ | 4600 | /* see mem_cgroup_resize_res_limit */ |
| 4681 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 4601 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
| 4682 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4602 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
| 4683 | while (retry_count) { | 4603 | while (retry_count) { |
| 4684 | if (signal_pending(current)) { | 4604 | if (signal_pending(current)) { |
| @@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 4727 | return ret; | 4647 | return ret; |
| 4728 | } | 4648 | } |
| 4729 | 4649 | ||
| 4730 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
| 4731 | gfp_t gfp_mask, | ||
| 4732 | unsigned long *total_scanned) | ||
| 4733 | { | ||
| 4734 | unsigned long nr_reclaimed = 0; | ||
| 4735 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
| 4736 | unsigned long reclaimed; | ||
| 4737 | int loop = 0; | ||
| 4738 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 4739 | unsigned long long excess; | ||
| 4740 | unsigned long nr_scanned; | ||
| 4741 | |||
| 4742 | if (order > 0) | ||
| 4743 | return 0; | ||
| 4744 | |||
| 4745 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
| 4746 | /* | ||
| 4747 | * This loop can run a while, especially if mem_cgroup's continuously | ||
| 4748 | * keep exceeding their soft limit and putting the system under | ||
| 4749 | * pressure | ||
| 4750 | */ | ||
| 4751 | do { | ||
| 4752 | if (next_mz) | ||
| 4753 | mz = next_mz; | ||
| 4754 | else | ||
| 4755 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
| 4756 | if (!mz) | ||
| 4757 | break; | ||
| 4758 | |||
| 4759 | nr_scanned = 0; | ||
| 4760 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
| 4761 | gfp_mask, &nr_scanned); | ||
| 4762 | nr_reclaimed += reclaimed; | ||
| 4763 | *total_scanned += nr_scanned; | ||
| 4764 | spin_lock(&mctz->lock); | ||
| 4765 | |||
| 4766 | /* | ||
| 4767 | * If we failed to reclaim anything from this memory cgroup | ||
| 4768 | * it is time to move on to the next cgroup | ||
| 4769 | */ | ||
| 4770 | next_mz = NULL; | ||
| 4771 | if (!reclaimed) { | ||
| 4772 | do { | ||
| 4773 | /* | ||
| 4774 | * Loop until we find yet another one. | ||
| 4775 | * | ||
| 4776 | * By the time we get the soft_limit lock | ||
| 4777 | * again, someone might have added the | ||
| 4778 | * group back on the RB tree. Iterate to | ||
| 4779 | * make sure we get a different mem. | ||
| 4780 | * mem_cgroup_largest_soft_limit_node returns | ||
| 4781 | * NULL if no other cgroup is present on | ||
| 4782 | * the tree | ||
| 4783 | */ | ||
| 4784 | next_mz = | ||
| 4785 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 4786 | if (next_mz == mz) | ||
| 4787 | css_put(&next_mz->memcg->css); | ||
| 4788 | else /* next_mz == NULL or other memcg */ | ||
| 4789 | break; | ||
| 4790 | } while (1); | ||
| 4791 | } | ||
| 4792 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
| 4793 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
| 4794 | /* | ||
| 4795 | * One school of thought says that we should not add | ||
| 4796 | * back the node to the tree if reclaim returns 0. | ||
| 4797 | * But our reclaim could return 0, simply because due | ||
| 4798 | * to priority we are exposing a smaller subset of | ||
| 4799 | * memory to reclaim from. Consider this as a longer | ||
| 4800 | * term TODO. | ||
| 4801 | */ | ||
| 4802 | /* If excess == 0, no tree ops */ | ||
| 4803 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
| 4804 | spin_unlock(&mctz->lock); | ||
| 4805 | css_put(&mz->memcg->css); | ||
| 4806 | loop++; | ||
| 4807 | /* | ||
| 4808 | * Could not reclaim anything and there are no more | ||
| 4809 | * mem cgroups to try or we seem to be looping without | ||
| 4810 | * reclaiming anything. | ||
| 4811 | */ | ||
| 4812 | if (!nr_reclaimed && | ||
| 4813 | (next_mz == NULL || | ||
| 4814 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
| 4815 | break; | ||
| 4816 | } while (!nr_reclaimed); | ||
| 4817 | if (next_mz) | ||
| 4818 | css_put(&next_mz->memcg->css); | ||
| 4819 | return nr_reclaimed; | ||
| 4820 | } | ||
| 4821 | |||
| 4822 | /** | 4650 | /** |
| 4823 | * mem_cgroup_force_empty_list - clears LRU of a group | 4651 | * mem_cgroup_force_empty_list - clears LRU of a group |
| 4824 | * @memcg: group to clear | 4652 | * @memcg: group to clear |
| @@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, | |||
| 4990 | unsigned int event) | 4818 | unsigned int event) |
| 4991 | { | 4819 | { |
| 4992 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4820 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 4993 | int ret; | ||
| 4994 | 4821 | ||
| 4995 | if (mem_cgroup_is_root(memcg)) | 4822 | if (mem_cgroup_is_root(memcg)) |
| 4996 | return -EINVAL; | 4823 | return -EINVAL; |
| 4997 | css_get(&memcg->css); | 4824 | return mem_cgroup_force_empty(memcg); |
| 4998 | ret = mem_cgroup_force_empty(memcg); | ||
| 4999 | css_put(&memcg->css); | ||
| 5000 | |||
| 5001 | return ret; | ||
| 5002 | } | 4825 | } |
| 5003 | 4826 | ||
| 5004 | |||
| 5005 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | 4827 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, |
| 5006 | struct cftype *cft) | 4828 | struct cftype *cft) |
| 5007 | { | 4829 | { |
| @@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
| 5139 | */ | 4961 | */ |
| 5140 | mutex_lock(&memcg_create_mutex); | 4962 | mutex_lock(&memcg_create_mutex); |
| 5141 | mutex_lock(&set_limit_mutex); | 4963 | mutex_lock(&set_limit_mutex); |
| 5142 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4964 | if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { |
| 5143 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { | 4965 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { |
| 5144 | ret = -EBUSY; | 4966 | ret = -EBUSY; |
| 5145 | goto out; | 4967 | goto out; |
| @@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
| 5149 | 4971 | ||
| 5150 | ret = memcg_update_cache_sizes(memcg); | 4972 | ret = memcg_update_cache_sizes(memcg); |
| 5151 | if (ret) { | 4973 | if (ret) { |
| 5152 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 4974 | res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); |
| 5153 | goto out; | 4975 | goto out; |
| 5154 | } | 4976 | } |
| 5155 | static_key_slow_inc(&memcg_kmem_enabled_key); | 4977 | static_key_slow_inc(&memcg_kmem_enabled_key); |
| @@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
| 6089 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
| 6090 | mz = &pn->zoneinfo[zone]; | 5912 | mz = &pn->zoneinfo[zone]; |
| 6091 | lruvec_init(&mz->lruvec); | 5913 | lruvec_init(&mz->lruvec); |
| 6092 | mz->usage_in_excess = 0; | ||
| 6093 | mz->on_tree = false; | ||
| 6094 | mz->memcg = memcg; | 5914 | mz->memcg = memcg; |
| 6095 | } | 5915 | } |
| 6096 | memcg->nodeinfo[node] = pn; | 5916 | memcg->nodeinfo[node] = pn; |
| @@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 6146 | int node; | 5966 | int node; |
| 6147 | size_t size = memcg_size(); | 5967 | size_t size = memcg_size(); |
| 6148 | 5968 | ||
| 6149 | mem_cgroup_remove_from_trees(memcg); | ||
| 6150 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
| 6151 | 5970 | ||
| 6152 | for_each_node(node) | 5971 | for_each_node(node) |
| @@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
| 6183 | } | 6002 | } |
| 6184 | EXPORT_SYMBOL(parent_mem_cgroup); | 6003 | EXPORT_SYMBOL(parent_mem_cgroup); |
| 6185 | 6004 | ||
| 6186 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
| 6187 | { | ||
| 6188 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 6189 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 6190 | int tmp, node, zone; | ||
| 6191 | |||
| 6192 | for_each_node(node) { | ||
| 6193 | tmp = node; | ||
| 6194 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 6195 | tmp = -1; | ||
| 6196 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 6197 | BUG_ON(!rtpn); | ||
| 6198 | |||
| 6199 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 6200 | |||
| 6201 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 6202 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 6203 | rtpz->rb_root = RB_ROOT; | ||
| 6204 | spin_lock_init(&rtpz->lock); | ||
| 6205 | } | ||
| 6206 | } | ||
| 6207 | } | ||
| 6208 | |||
| 6209 | static struct cgroup_subsys_state * __ref | 6005 | static struct cgroup_subsys_state * __ref |
| 6210 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
| 6211 | { | 6007 | { |
| @@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 6235 | mutex_init(&memcg->thresholds_lock); | 6031 | mutex_init(&memcg->thresholds_lock); |
| 6236 | spin_lock_init(&memcg->move_lock); | 6032 | spin_lock_init(&memcg->move_lock); |
| 6237 | vmpressure_init(&memcg->vmpressure); | 6033 | vmpressure_init(&memcg->vmpressure); |
| 6034 | spin_lock_init(&memcg->soft_lock); | ||
| 6238 | 6035 | ||
| 6239 | return &memcg->css; | 6036 | return &memcg->css; |
| 6240 | 6037 | ||
| @@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 6312 | 6109 | ||
| 6313 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
| 6314 | mem_cgroup_reparent_charges(memcg); | 6111 | mem_cgroup_reparent_charges(memcg); |
| 6112 | if (memcg->soft_contributed) { | ||
| 6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
| 6114 | atomic_dec(&memcg->children_in_excess); | ||
| 6115 | |||
| 6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
| 6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
| 6118 | } | ||
| 6315 | mem_cgroup_destroy_all_caches(memcg); | 6119 | mem_cgroup_destroy_all_caches(memcg); |
| 6316 | vmpressure_cleanup(&memcg->vmpressure); | 6120 | vmpressure_cleanup(&memcg->vmpressure); |
| 6317 | } | 6121 | } |
| @@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void) | |||
| 6986 | { | 6790 | { |
| 6987 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
| 6988 | enable_swap_cgroup(); | 6792 | enable_swap_cgroup(); |
| 6989 | mem_cgroup_soft_limit_tree_init(); | ||
| 6990 | memcg_stock_init(); | 6793 | memcg_stock_init(); |
| 6991 | return 0; | 6794 | return 0; |
| 6992 | } | 6795 | } |
diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..ca0003947115 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3695 | * but allow concurrent faults), and pte mapped but not yet locked. | 3695 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 3697 | */ | 3697 | */ |
| 3698 | int handle_pte_fault(struct mm_struct *mm, | 3698 | static int handle_pte_fault(struct mm_struct *mm, |
| 3699 | struct vm_area_struct *vma, unsigned long address, | 3699 | struct vm_area_struct *vma, unsigned long address, |
| 3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
| 3701 | { | 3701 | { |
| @@ -3754,22 +3754,14 @@ unlock: | |||
| 3754 | /* | 3754 | /* |
| 3755 | * By the time we get here, we already hold the mm semaphore | 3755 | * By the time we get here, we already hold the mm semaphore |
| 3756 | */ | 3756 | */ |
| 3757 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3757 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 3758 | unsigned long address, unsigned int flags) | 3758 | unsigned long address, unsigned int flags) |
| 3759 | { | 3759 | { |
| 3760 | pgd_t *pgd; | 3760 | pgd_t *pgd; |
| 3761 | pud_t *pud; | 3761 | pud_t *pud; |
| 3762 | pmd_t *pmd; | 3762 | pmd_t *pmd; |
| 3763 | pte_t *pte; | 3763 | pte_t *pte; |
| 3764 | 3764 | ||
| 3765 | __set_current_state(TASK_RUNNING); | ||
| 3766 | |||
| 3767 | count_vm_event(PGFAULT); | ||
| 3768 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
| 3769 | |||
| 3770 | /* do counter updates before entering really critical section. */ | ||
| 3771 | check_sync_rss_stat(current); | ||
| 3772 | |||
| 3773 | if (unlikely(is_vm_hugetlb_page(vma))) | 3765 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 3774 | return hugetlb_fault(mm, vma, address, flags); | 3766 | return hugetlb_fault(mm, vma, address, flags); |
| 3775 | 3767 | ||
| @@ -3782,9 +3774,12 @@ retry: | |||
| 3782 | if (!pmd) | 3774 | if (!pmd) |
| 3783 | return VM_FAULT_OOM; | 3775 | return VM_FAULT_OOM; |
| 3784 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3776 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
| 3777 | int ret = VM_FAULT_FALLBACK; | ||
| 3785 | if (!vma->vm_ops) | 3778 | if (!vma->vm_ops) |
| 3786 | return do_huge_pmd_anonymous_page(mm, vma, address, | 3779 | ret = do_huge_pmd_anonymous_page(mm, vma, address, |
| 3787 | pmd, flags); | 3780 | pmd, flags); |
| 3781 | if (!(ret & VM_FAULT_FALLBACK)) | ||
| 3782 | return ret; | ||
| 3788 | } else { | 3783 | } else { |
| 3789 | pmd_t orig_pmd = *pmd; | 3784 | pmd_t orig_pmd = *pmd; |
| 3790 | int ret; | 3785 | int ret; |
| @@ -3850,6 +3845,37 @@ retry: | |||
| 3850 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3845 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
| 3851 | } | 3846 | } |
| 3852 | 3847 | ||
| 3848 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 3849 | unsigned long address, unsigned int flags) | ||
| 3850 | { | ||
| 3851 | int ret; | ||
| 3852 | |||
| 3853 | __set_current_state(TASK_RUNNING); | ||
| 3854 | |||
| 3855 | count_vm_event(PGFAULT); | ||
| 3856 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
| 3857 | |||
| 3858 | /* do counter updates before entering really critical section. */ | ||
| 3859 | check_sync_rss_stat(current); | ||
| 3860 | |||
| 3861 | /* | ||
| 3862 | * Enable the memcg OOM handling for faults triggered in user | ||
| 3863 | * space. Kernel faults are handled more gracefully. | ||
| 3864 | */ | ||
| 3865 | if (flags & FAULT_FLAG_USER) | ||
| 3866 | mem_cgroup_enable_oom(); | ||
| 3867 | |||
| 3868 | ret = __handle_mm_fault(mm, vma, address, flags); | ||
| 3869 | |||
| 3870 | if (flags & FAULT_FLAG_USER) | ||
| 3871 | mem_cgroup_disable_oom(); | ||
| 3872 | |||
| 3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | ||
| 3874 | mem_cgroup_oom_synchronize(); | ||
| 3875 | |||
| 3876 | return ret; | ||
| 3877 | } | ||
| 3878 | |||
| 3853 | #ifndef __PAGETABLE_PUD_FOLDED | 3879 | #ifndef __PAGETABLE_PUD_FOLDED |
| 3854 | /* | 3880 | /* |
| 3855 | * Allocate page upper directory. | 3881 | * Allocate page upper directory. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 98e75f2ac7bc..314e9d274381 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -678,9 +678,12 @@ out: | |||
| 678 | */ | 678 | */ |
| 679 | void pagefault_out_of_memory(void) | 679 | void pagefault_out_of_memory(void) |
| 680 | { | 680 | { |
| 681 | struct zonelist *zonelist = node_zonelist(first_online_node, | 681 | struct zonelist *zonelist; |
| 682 | GFP_KERNEL); | ||
| 683 | 682 | ||
| 683 | if (mem_cgroup_oom_synchronize()) | ||
| 684 | return; | ||
| 685 | |||
| 686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | ||
| 684 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | 687 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { |
| 685 | out_of_memory(NULL, 0, 0, NULL, false); | 688 | out_of_memory(NULL, 0, 0, NULL, false); |
| 686 | clear_zonelist_oom(zonelist, GFP_KERNEL); | 689 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6c7b0187be8e..f5236f804aa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
| 2143 | 2143 | ||
| 2144 | /* | 2144 | /* |
| 2145 | * Helper function for set_page_writeback family. | 2145 | * Helper function for set_page_writeback family. |
| 2146 | * | ||
| 2147 | * The caller must hold mem_cgroup_begin/end_update_page_stat() lock | ||
| 2148 | * while calling this function. | ||
| 2149 | * See test_set_page_writeback for example. | ||
| 2150 | * | ||
| 2146 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic | 2151 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic |
| 2147 | * wrt interrupts. | 2152 | * wrt interrupts. |
| 2148 | */ | 2153 | */ |
| 2149 | void account_page_writeback(struct page *page) | 2154 | void account_page_writeback(struct page *page) |
| 2150 | { | 2155 | { |
| 2156 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
| 2151 | inc_zone_page_state(page, NR_WRITEBACK); | 2157 | inc_zone_page_state(page, NR_WRITEBACK); |
| 2152 | } | 2158 | } |
| 2153 | EXPORT_SYMBOL(account_page_writeback); | 2159 | EXPORT_SYMBOL(account_page_writeback); |
| @@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page) | |||
| 2364 | { | 2370 | { |
| 2365 | struct address_space *mapping = page_mapping(page); | 2371 | struct address_space *mapping = page_mapping(page); |
| 2366 | int ret; | 2372 | int ret; |
| 2373 | bool locked; | ||
| 2374 | unsigned long memcg_flags; | ||
| 2367 | 2375 | ||
| 2376 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
| 2368 | if (mapping) { | 2377 | if (mapping) { |
| 2369 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2378 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 2370 | unsigned long flags; | 2379 | unsigned long flags; |
| @@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page) | |||
| 2385 | ret = TestClearPageWriteback(page); | 2394 | ret = TestClearPageWriteback(page); |
| 2386 | } | 2395 | } |
| 2387 | if (ret) { | 2396 | if (ret) { |
| 2397 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
| 2388 | dec_zone_page_state(page, NR_WRITEBACK); | 2398 | dec_zone_page_state(page, NR_WRITEBACK); |
| 2389 | inc_zone_page_state(page, NR_WRITTEN); | 2399 | inc_zone_page_state(page, NR_WRITTEN); |
| 2390 | } | 2400 | } |
| 2401 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
| 2391 | return ret; | 2402 | return ret; |
| 2392 | } | 2403 | } |
| 2393 | 2404 | ||
| @@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page) | |||
| 2395 | { | 2406 | { |
| 2396 | struct address_space *mapping = page_mapping(page); | 2407 | struct address_space *mapping = page_mapping(page); |
| 2397 | int ret; | 2408 | int ret; |
| 2409 | bool locked; | ||
| 2410 | unsigned long memcg_flags; | ||
| 2398 | 2411 | ||
| 2412 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
| 2399 | if (mapping) { | 2413 | if (mapping) { |
| 2400 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2414 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 2401 | unsigned long flags; | 2415 | unsigned long flags; |
| @@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page) | |||
| 2422 | } | 2436 | } |
| 2423 | if (!ret) | 2437 | if (!ret) |
| 2424 | account_page_writeback(page); | 2438 | account_page_writeback(page); |
| 2439 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
| 2425 | return ret; | 2440 | return ret; |
| 2426 | 2441 | ||
| 2427 | } | 2442 | } |
| @@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page, | |||
| 1052 | { | 1052 | { |
| 1053 | int first = atomic_inc_and_test(&page->_mapcount); | 1053 | int first = atomic_inc_and_test(&page->_mapcount); |
| 1054 | if (first) { | 1054 | if (first) { |
| 1055 | if (!PageTransHuge(page)) | 1055 | if (PageTransHuge(page)) |
| 1056 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 1057 | else | ||
| 1058 | __inc_zone_page_state(page, | 1056 | __inc_zone_page_state(page, |
| 1059 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1057 | NR_ANON_TRANSPARENT_HUGEPAGES); |
| 1058 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
| 1059 | hpage_nr_pages(page)); | ||
| 1060 | } | 1060 | } |
| 1061 | if (unlikely(PageKsm(page))) | 1061 | if (unlikely(PageKsm(page))) |
| 1062 | return; | 1062 | return; |
| @@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 1086 | SetPageSwapBacked(page); | 1086 | SetPageSwapBacked(page); |
| 1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
| 1088 | if (!PageTransHuge(page)) | 1088 | if (PageTransHuge(page)) |
| 1089 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 1090 | else | ||
| 1091 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1089 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
| 1090 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
| 1091 | hpage_nr_pages(page)); | ||
| 1092 | __page_set_anon_rmap(page, vma, address, 1); | 1092 | __page_set_anon_rmap(page, vma, address, 1); |
| 1093 | if (!mlocked_vma_newpage(vma, page)) { | 1093 | if (!mlocked_vma_newpage(vma, page)) { |
| 1094 | SetPageActive(page); | 1094 | SetPageActive(page); |
| @@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page) | |||
| 1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | 1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); |
| 1112 | if (atomic_inc_and_test(&page->_mapcount)) { | 1112 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 1114 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1114 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
| 1115 | } | 1115 | } |
| 1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
| 1117 | } | 1117 | } |
| @@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page) | |||
| 1148 | goto out; | 1148 | goto out; |
| 1149 | if (anon) { | 1149 | if (anon) { |
| 1150 | mem_cgroup_uncharge_page(page); | 1150 | mem_cgroup_uncharge_page(page); |
| 1151 | if (!PageTransHuge(page)) | 1151 | if (PageTransHuge(page)) |
| 1152 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
| 1153 | else | ||
| 1154 | __dec_zone_page_state(page, | 1152 | __dec_zone_page_state(page, |
| 1155 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1153 | NR_ANON_TRANSPARENT_HUGEPAGES); |
| 1154 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
| 1155 | -hpage_nr_pages(page)); | ||
| 1156 | } else { | 1156 | } else { |
| 1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
| 1158 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1158 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
| 1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
| 1160 | } | 1160 | } |
| 1161 | if (unlikely(PageMlocked(page))) | 1161 | if (unlikely(PageMlocked(page))) |
| @@ -432,6 +432,11 @@ static void activate_page_drain(int cpu) | |||
| 432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | 432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); |
| 433 | } | 433 | } |
| 434 | 434 | ||
| 435 | static bool need_activate_page_drain(int cpu) | ||
| 436 | { | ||
| 437 | return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; | ||
| 438 | } | ||
| 439 | |||
| 435 | void activate_page(struct page *page) | 440 | void activate_page(struct page *page) |
| 436 | { | 441 | { |
| 437 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 442 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
| @@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu) | |||
| 449 | { | 454 | { |
| 450 | } | 455 | } |
| 451 | 456 | ||
| 457 | static bool need_activate_page_drain(int cpu) | ||
| 458 | { | ||
| 459 | return false; | ||
| 460 | } | ||
| 461 | |||
| 452 | void activate_page(struct page *page) | 462 | void activate_page(struct page *page) |
| 453 | { | 463 | { |
| 454 | struct zone *zone = page_zone(page); | 464 | struct zone *zone = page_zone(page); |
| @@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) | |||
| 701 | lru_add_drain(); | 711 | lru_add_drain(); |
| 702 | } | 712 | } |
| 703 | 713 | ||
| 704 | /* | 714 | static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); |
| 705 | * Returns 0 for success | 715 | |
| 706 | */ | 716 | void lru_add_drain_all(void) |
| 707 | int lru_add_drain_all(void) | ||
| 708 | { | 717 | { |
| 709 | return schedule_on_each_cpu(lru_add_drain_per_cpu); | 718 | static DEFINE_MUTEX(lock); |
| 719 | static struct cpumask has_work; | ||
| 720 | int cpu; | ||
| 721 | |||
| 722 | mutex_lock(&lock); | ||
| 723 | get_online_cpus(); | ||
| 724 | cpumask_clear(&has_work); | ||
| 725 | |||
| 726 | for_each_online_cpu(cpu) { | ||
| 727 | struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); | ||
| 728 | |||
| 729 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | ||
| 730 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | ||
| 731 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | ||
| 732 | need_activate_page_drain(cpu)) { | ||
| 733 | INIT_WORK(work, lru_add_drain_per_cpu); | ||
| 734 | schedule_work_on(cpu, work); | ||
| 735 | cpumask_set_cpu(cpu, &has_work); | ||
| 736 | } | ||
| 737 | } | ||
| 738 | |||
| 739 | for_each_cpu(cpu, &has_work) | ||
| 740 | flush_work(&per_cpu(lru_add_drain_work, cpu)); | ||
| 741 | |||
| 742 | put_online_cpus(); | ||
| 743 | mutex_unlock(&lock); | ||
| 710 | } | 744 | } |
| 711 | 745 | ||
| 712 | /* | 746 | /* |
diff --git a/mm/truncate.c b/mm/truncate.c index e2e8a8a7eb9d..353b683afd6e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
| 567 | /** | 567 | /** |
| 568 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 568 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
| 569 | * @inode: inode | 569 | * @inode: inode |
| 570 | * @oldsize: old file size | ||
| 571 | * @newsize: new file size | 570 | * @newsize: new file size |
| 572 | * | 571 | * |
| 573 | * inode's new i_size must already be written before truncate_pagecache | 572 | * inode's new i_size must already be written before truncate_pagecache |
| @@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
| 580 | * situations such as writepage being called for a page that has already | 579 | * situations such as writepage being called for a page that has already |
| 581 | * had its underlying blocks deallocated. | 580 | * had its underlying blocks deallocated. |
| 582 | */ | 581 | */ |
| 583 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) | 582 | void truncate_pagecache(struct inode *inode, loff_t newsize) |
| 584 | { | 583 | { |
| 585 | struct address_space *mapping = inode->i_mapping; | 584 | struct address_space *mapping = inode->i_mapping; |
| 586 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | 585 | loff_t holebegin = round_up(newsize, PAGE_SIZE); |
| @@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
| 614 | */ | 613 | */ |
| 615 | void truncate_setsize(struct inode *inode, loff_t newsize) | 614 | void truncate_setsize(struct inode *inode, loff_t newsize) |
| 616 | { | 615 | { |
| 617 | loff_t oldsize; | ||
| 618 | |||
| 619 | oldsize = inode->i_size; | ||
| 620 | i_size_write(inode, newsize); | 616 | i_size_write(inode, newsize); |
| 621 | 617 | truncate_pagecache(inode, newsize); | |
| 622 | truncate_pagecache(inode, oldsize, newsize); | ||
| 623 | } | 618 | } |
| 624 | EXPORT_SYMBOL(truncate_setsize); | 619 | EXPORT_SYMBOL(truncate_setsize); |
| 625 | 620 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..8ed1b775bdc9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc) | |||
| 139 | { | 139 | { |
| 140 | return !sc->target_mem_cgroup; | 140 | return !sc->target_mem_cgroup; |
| 141 | } | 141 | } |
| 142 | |||
| 143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
| 144 | { | ||
| 145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
| 146 | return !mem_cgroup_disabled() && | ||
| 147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
| 148 | } | ||
| 142 | #else | 149 | #else |
| 143 | static bool global_reclaim(struct scan_control *sc) | 150 | static bool global_reclaim(struct scan_control *sc) |
| 144 | { | 151 | { |
| 145 | return true; | 152 | return true; |
| 146 | } | 153 | } |
| 154 | |||
| 155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
| 156 | { | ||
| 157 | return false; | ||
| 158 | } | ||
| 147 | #endif | 159 | #endif |
| 148 | 160 | ||
| 149 | unsigned long zone_reclaimable_pages(struct zone *zone) | 161 | unsigned long zone_reclaimable_pages(struct zone *zone) |
| @@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
| 2164 | } | 2176 | } |
| 2165 | } | 2177 | } |
| 2166 | 2178 | ||
| 2167 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 2179 | static int |
| 2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
| 2168 | { | 2181 | { |
| 2169 | unsigned long nr_reclaimed, nr_scanned; | 2182 | unsigned long nr_reclaimed, nr_scanned; |
| 2183 | int groups_scanned = 0; | ||
| 2170 | 2184 | ||
| 2171 | do { | 2185 | do { |
| 2172 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2186 | struct mem_cgroup *root = sc->target_mem_cgroup; |
| @@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2174 | .zone = zone, | 2188 | .zone = zone, |
| 2175 | .priority = sc->priority, | 2189 | .priority = sc->priority, |
| 2176 | }; | 2190 | }; |
| 2177 | struct mem_cgroup *memcg; | 2191 | struct mem_cgroup *memcg = NULL; |
| 2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
| 2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
| 2178 | 2194 | ||
| 2179 | nr_reclaimed = sc->nr_reclaimed; | 2195 | nr_reclaimed = sc->nr_reclaimed; |
| 2180 | nr_scanned = sc->nr_scanned; | 2196 | nr_scanned = sc->nr_scanned; |
| 2181 | 2197 | ||
| 2182 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { |
| 2183 | do { | ||
| 2184 | struct lruvec *lruvec; | 2199 | struct lruvec *lruvec; |
| 2185 | 2200 | ||
| 2201 | groups_scanned++; | ||
| 2186 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2187 | 2203 | ||
| 2188 | shrink_lruvec(lruvec, sc); | 2204 | shrink_lruvec(lruvec, sc); |
| @@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2202 | mem_cgroup_iter_break(root, memcg); | 2218 | mem_cgroup_iter_break(root, memcg); |
| 2203 | break; | 2219 | break; |
| 2204 | } | 2220 | } |
| 2205 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2221 | } |
| 2206 | } while (memcg); | ||
| 2207 | 2222 | ||
| 2208 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
| 2209 | sc->nr_scanned - nr_scanned, | 2224 | sc->nr_scanned - nr_scanned, |
| @@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 2211 | 2226 | ||
| 2212 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
| 2213 | sc->nr_scanned - nr_scanned, sc)); | 2228 | sc->nr_scanned - nr_scanned, sc)); |
| 2229 | |||
| 2230 | return groups_scanned; | ||
| 2231 | } | ||
| 2232 | |||
| 2233 | |||
| 2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
| 2235 | { | ||
| 2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
| 2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
| 2238 | int scanned_groups; | ||
| 2239 | |||
| 2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
| 2241 | /* | ||
| 2242 | * memcg iterator might race with other reclaimer or start from | ||
| 2243 | * a incomplete tree walk so the tree walk in __shrink_zone | ||
| 2244 | * might have missed groups that are above the soft limit. Try | ||
| 2245 | * another loop to catch up with others. Do it just once to | ||
| 2246 | * prevent from reclaim latencies when other reclaimers always | ||
| 2247 | * preempt this one. | ||
| 2248 | */ | ||
| 2249 | if (do_soft_reclaim && !scanned_groups) | ||
| 2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
| 2251 | |||
| 2252 | /* | ||
| 2253 | * No group is over the soft limit or those that are do not have | ||
| 2254 | * pages in the zone we are reclaiming so we have to reclaim everybody | ||
| 2255 | */ | ||
| 2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
| 2257 | __shrink_zone(zone, sc, false); | ||
| 2258 | return; | ||
| 2259 | } | ||
| 2214 | } | 2260 | } |
| 2215 | 2261 | ||
| 2216 | /* Returns true if compaction should go ahead for a high-order request */ | 2262 | /* Returns true if compaction should go ahead for a high-order request */ |
| @@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2274 | { | 2320 | { |
| 2275 | struct zoneref *z; | 2321 | struct zoneref *z; |
| 2276 | struct zone *zone; | 2322 | struct zone *zone; |
| 2277 | unsigned long nr_soft_reclaimed; | ||
| 2278 | unsigned long nr_soft_scanned; | ||
| 2279 | bool aborted_reclaim = false; | 2323 | bool aborted_reclaim = false; |
| 2280 | 2324 | ||
| 2281 | /* | 2325 | /* |
| @@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2315 | continue; | 2359 | continue; |
| 2316 | } | 2360 | } |
| 2317 | } | 2361 | } |
| 2318 | /* | ||
| 2319 | * This steals pages from memory cgroups over softlimit | ||
| 2320 | * and returns the number of reclaimed pages and | ||
| 2321 | * scanned pages. This works for global memory pressure | ||
| 2322 | * and balancing, not for a memcg's limit. | ||
| 2323 | */ | ||
| 2324 | nr_soft_scanned = 0; | ||
| 2325 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
| 2326 | sc->order, sc->gfp_mask, | ||
| 2327 | &nr_soft_scanned); | ||
| 2328 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
| 2329 | sc->nr_scanned += nr_soft_scanned; | ||
| 2330 | /* need some check for avoid more shrink_zone() */ | 2362 | /* need some check for avoid more shrink_zone() */ |
| 2331 | } | 2363 | } |
| 2332 | 2364 | ||
| @@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 2920 | { | 2952 | { |
| 2921 | int i; | 2953 | int i; |
| 2922 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 2923 | unsigned long nr_soft_reclaimed; | ||
| 2924 | unsigned long nr_soft_scanned; | ||
| 2925 | struct scan_control sc = { | 2955 | struct scan_control sc = { |
| 2926 | .gfp_mask = GFP_KERNEL, | 2956 | .gfp_mask = GFP_KERNEL, |
| 2927 | .priority = DEF_PRIORITY, | 2957 | .priority = DEF_PRIORITY, |
| @@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3036 | 3066 | ||
| 3037 | sc.nr_scanned = 0; | 3067 | sc.nr_scanned = 0; |
| 3038 | 3068 | ||
| 3039 | nr_soft_scanned = 0; | ||
| 3040 | /* | ||
| 3041 | * Call soft limit reclaim before calling shrink_zone. | ||
| 3042 | */ | ||
| 3043 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
| 3044 | order, sc.gfp_mask, | ||
| 3045 | &nr_soft_scanned); | ||
| 3046 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
| 3047 | |||
| 3048 | /* | 3069 | /* |
| 3049 | * There should be no need to raise the scanning | 3070 | * There should be no need to raise the scanning |
| 3050 | * priority if enough pages are already being scanned | 3071 | * priority if enough pages are already being scanned |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 8a57d79b0b16..559d4ae6ebf4 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
| @@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
| 87 | if (!cg_proto) | 87 | if (!cg_proto) |
| 88 | return -EINVAL; | 88 | return -EINVAL; |
| 89 | 89 | ||
| 90 | if (val > RESOURCE_MAX) | 90 | if (val > RES_COUNTER_MAX) |
| 91 | val = RESOURCE_MAX; | 91 | val = RES_COUNTER_MAX; |
| 92 | 92 | ||
| 93 | tcp = tcp_from_cgproto(cg_proto); | 93 | tcp = tcp_from_cgproto(cg_proto); |
| 94 | 94 | ||
| @@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
| 101 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, | 101 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, |
| 102 | net->ipv4.sysctl_tcp_mem[i]); | 102 | net->ipv4.sysctl_tcp_mem[i]); |
| 103 | 103 | ||
| 104 | if (val == RESOURCE_MAX) | 104 | if (val == RES_COUNTER_MAX) |
| 105 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | 105 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); |
| 106 | else if (val != RESOURCE_MAX) { | 106 | else if (val != RES_COUNTER_MAX) { |
| 107 | /* | 107 | /* |
| 108 | * The active bit needs to be written after the static_key | 108 | * The active bit needs to be written after the static_key |
| 109 | * update. This is what guarantees that the socket activation | 109 | * update. This is what guarantees that the socket activation |
| @@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) | |||
| 187 | 187 | ||
| 188 | switch (cft->private) { | 188 | switch (cft->private) { |
| 189 | case RES_LIMIT: | 189 | case RES_LIMIT: |
| 190 | val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); | 190 | val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); |
| 191 | break; | 191 | break; |
| 192 | case RES_USAGE: | 192 | case RES_USAGE: |
| 193 | val = tcp_read_usage(memcg); | 193 | val = tcp_read_usage(memcg); |
