diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-22 12:04:48 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-22 12:04:48 -0400 |
commit | 95211279c5ad00a317c98221d7e4365e02f20836 (patch) | |
tree | 2ddc8625378d2915b8c96392f3cf6663b705ed55 | |
parent | 5375871d432ae9fc581014ac117b96aaee3cd0c7 (diff) | |
parent | 12724850e8064f64b6223d26d78c0597c742c65a (diff) |
Merge branch 'akpm' (Andrew's patch-bomb)
Merge first batch of patches from Andrew Morton:
"A few misc things and all the MM queue"
* emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits)
memcg: avoid THP split in task migration
thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE
memcg: clean up existing move charge code
mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read()
mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event()
mm/memcontrol.c: s/stealed/stolen/
memcg: fix performance of mem_cgroup_begin_update_page_stat()
memcg: remove PCG_FILE_MAPPED
memcg: use new logic for page stat accounting
memcg: remove PCG_MOVE_LOCK flag from page_cgroup
memcg: simplify move_account() check
memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat)
memcg: kill dead prev_priority stubs
memcg: remove PCG_CACHE page_cgroup flag
memcg: let css_get_next() rely upon rcu_read_lock()
cgroup: revert ss_id_lock to spinlock
idr: make idr_get_next() good for rcu_read_lock()
memcg: remove unnecessary thp check in page stat accounting
memcg: remove redundant returns
memcg: enum lru_list lru
...
77 files changed, 1902 insertions, 1235 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a76a26a1db8a..b7413cb46dcb 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -290,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) | |||
290 | rsslim current limit in bytes on the rss | 290 | rsslim current limit in bytes on the rss |
291 | start_code address above which program text can run | 291 | start_code address above which program text can run |
292 | end_code address below which program text can run | 292 | end_code address below which program text can run |
293 | start_stack address of the start of the stack | 293 | start_stack address of the start of the main process stack |
294 | esp current value of ESP | 294 | esp current value of ESP |
295 | eip current value of EIP | 295 | eip current value of EIP |
296 | pending bitmap of pending signals | 296 | pending bitmap of pending signals |
@@ -325,7 +325,7 @@ address perms offset dev inode pathname | |||
325 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 | 325 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 |
326 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 | 326 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 |
327 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 | 327 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 |
328 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 | 328 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] |
329 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 | 329 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 |
330 | a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 | 330 | a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 |
331 | a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 | 331 | a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 |
@@ -357,11 +357,39 @@ is not associated with a file: | |||
357 | 357 | ||
358 | [heap] = the heap of the program | 358 | [heap] = the heap of the program |
359 | [stack] = the stack of the main process | 359 | [stack] = the stack of the main process |
360 | [stack:1001] = the stack of the thread with tid 1001 | ||
360 | [vdso] = the "virtual dynamic shared object", | 361 | [vdso] = the "virtual dynamic shared object", |
361 | the kernel system call handler | 362 | the kernel system call handler |
362 | 363 | ||
363 | or if empty, the mapping is anonymous. | 364 | or if empty, the mapping is anonymous. |
364 | 365 | ||
366 | The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint | ||
367 | of the individual tasks of a process. In this file you will see a mapping marked | ||
368 | as [stack] if that task sees it as a stack. This is a key difference from the | ||
369 | content of /proc/PID/maps, where you will see all mappings that are being used | ||
370 | as stack by all of those tasks. Hence, for the example above, the task-level | ||
371 | map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: | ||
372 | |||
373 | 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test | ||
374 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test | ||
375 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] | ||
376 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 | ||
377 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 | ||
378 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 | ||
379 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack] | ||
380 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 | ||
381 | a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 | ||
382 | a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 | ||
383 | a800b000-a800e000 rw-p 00000000 00:00 0 | ||
384 | a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0 | ||
385 | a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0 | ||
386 | a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0 | ||
387 | a8024000-a8027000 rw-p 00000000 00:00 0 | ||
388 | a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2 | ||
389 | a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2 | ||
390 | a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2 | ||
391 | aff35000-aff4a000 rw-p 00000000 00:00 0 | ||
392 | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso] | ||
365 | 393 | ||
366 | The /proc/PID/smaps is an extension based on maps, showing the memory | 394 | The /proc/PID/smaps is an extension based on maps, showing the memory |
367 | consumption for each of the process's mappings. For each of mappings there | 395 | consumption for each of the process's mappings. For each of mappings there |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8cadb7551fca..7986d79d9d17 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -2635,6 +2635,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2635 | to facilitate early boot debugging. | 2635 | to facilitate early boot debugging. |
2636 | See also Documentation/trace/events.txt | 2636 | See also Documentation/trace/events.txt |
2637 | 2637 | ||
2638 | transparent_hugepage= | ||
2639 | [KNL] | ||
2640 | Format: [always|madvise|never] | ||
2641 | Can be used to control the default behavior of the system | ||
2642 | with respect to transparent hugepages. | ||
2643 | See Documentation/vm/transhuge.txt for more details. | ||
2644 | |||
2638 | tsc= Disable clocksource stability checks for TSC. | 2645 | tsc= Disable clocksource stability checks for TSC. |
2639 | Format: <string> | 2646 | Format: <string> |
2640 | [x86] reliable: mark tsc clocksource as reliable, this | 2647 | [x86] reliable: mark tsc clocksource as reliable, this |
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 7445caa26d05..0b13f02d4059 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c | |||
@@ -98,6 +98,7 @@ | |||
98 | #define KPF_HWPOISON 19 | 98 | #define KPF_HWPOISON 19 |
99 | #define KPF_NOPAGE 20 | 99 | #define KPF_NOPAGE 20 |
100 | #define KPF_KSM 21 | 100 | #define KPF_KSM 21 |
101 | #define KPF_THP 22 | ||
101 | 102 | ||
102 | /* [32-] kernel hacking assistances */ | 103 | /* [32-] kernel hacking assistances */ |
103 | #define KPF_RESERVED 32 | 104 | #define KPF_RESERVED 32 |
@@ -147,6 +148,7 @@ static const char *page_flag_names[] = { | |||
147 | [KPF_HWPOISON] = "X:hwpoison", | 148 | [KPF_HWPOISON] = "X:hwpoison", |
148 | [KPF_NOPAGE] = "n:nopage", | 149 | [KPF_NOPAGE] = "n:nopage", |
149 | [KPF_KSM] = "x:ksm", | 150 | [KPF_KSM] = "x:ksm", |
151 | [KPF_THP] = "t:thp", | ||
150 | 152 | ||
151 | [KPF_RESERVED] = "r:reserved", | 153 | [KPF_RESERVED] = "r:reserved", |
152 | [KPF_MLOCKED] = "m:mlocked", | 154 | [KPF_MLOCKED] = "m:mlocked", |
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index df09b9650a81..4600cbe3d6be 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt | |||
@@ -60,6 +60,7 @@ There are three components to pagemap: | |||
60 | 19. HWPOISON | 60 | 19. HWPOISON |
61 | 20. NOPAGE | 61 | 20. NOPAGE |
62 | 21. KSM | 62 | 21. KSM |
63 | 22. THP | ||
63 | 64 | ||
64 | Short descriptions to the page flags: | 65 | Short descriptions to the page flags: |
65 | 66 | ||
@@ -97,6 +98,9 @@ Short descriptions to the page flags: | |||
97 | 21. KSM | 98 | 21. KSM |
98 | identical memory pages dynamically shared between one or more processes | 99 | identical memory pages dynamically shared between one or more processes |
99 | 100 | ||
101 | 22. THP | ||
102 | contiguous pages which construct transparent hugepages | ||
103 | |||
100 | [IO related page flags] | 104 | [IO related page flags] |
101 | 1. ERROR IO error occurred | 105 | 1. ERROR IO error occurred |
102 | 3. UPTODATE page has up-to-date data | 106 | 3. UPTODATE page has up-to-date data |
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 023b8860dc97..c8f5b50db89c 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c | |||
@@ -776,7 +776,6 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, | |||
776 | siginfo_t *info, | 776 | siginfo_t *info, |
777 | sigset_t *oldset, struct pt_regs *regs) | 777 | sigset_t *oldset, struct pt_regs *regs) |
778 | { | 778 | { |
779 | sigset_t blocked; | ||
780 | int err; | 779 | int err; |
781 | 780 | ||
782 | if (ka->sa.sa_flags & SA_SIGINFO) | 781 | if (ka->sa.sa_flags & SA_SIGINFO) |
@@ -787,11 +786,7 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, | |||
787 | if (err) | 786 | if (err) |
788 | return err; | 787 | return err; |
789 | 788 | ||
790 | sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); | 789 | block_sigmask(ka, signr); |
791 | if (!(ka->sa.sa_flags & SA_NOMASK)) | ||
792 | sigaddset(&blocked, signr); | ||
793 | set_current_blocked(&blocked); | ||
794 | |||
795 | tracehook_signal_handler(signr, info, ka, regs, 0); | 790 | tracehook_signal_handler(signr, info, ka, regs, 0); |
796 | 791 | ||
797 | return 0; | 792 | return 0; |
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index d54c6e53aba0..7bb71b6fbd20 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c | |||
@@ -465,7 +465,6 @@ static inline int | |||
465 | handle_signal(unsigned long signr, struct k_sigaction *ka, | 465 | handle_signal(unsigned long signr, struct k_sigaction *ka, |
466 | siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) | 466 | siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) |
467 | { | 467 | { |
468 | sigset_t blocked; | ||
469 | int err; | 468 | int err; |
470 | 469 | ||
471 | if (ka->sa.sa_flags & SA_SIGINFO) | 470 | if (ka->sa.sa_flags & SA_SIGINFO) |
@@ -476,11 +475,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, | |||
476 | if (err) | 475 | if (err) |
477 | return err; | 476 | return err; |
478 | 477 | ||
479 | sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); | 478 | block_sigmask(ka, signr); |
480 | if (!(ka->sa.sa_flags & SA_NOMASK)) | ||
481 | sigaddset(&blocked, signr); | ||
482 | set_current_blocked(&blocked); | ||
483 | |||
484 | tracehook_signal_handler(signr, info, ka, regs, 0); | 479 | tracehook_signal_handler(signr, info, ka, regs, 0); |
485 | 480 | ||
486 | return 0; | 481 | return 0; |
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index f0836cd0e2f2..d8a67e60be80 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c | |||
@@ -479,18 +479,14 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, | |||
479 | siginfo_t *info, | 479 | siginfo_t *info, |
480 | sigset_t *oldset, struct pt_regs *regs) | 480 | sigset_t *oldset, struct pt_regs *regs) |
481 | { | 481 | { |
482 | sigset_t blocked; | ||
483 | int err; | 482 | int err; |
484 | 483 | ||
485 | err = setup_rt_frame(ka, regs, signr, oldset, | 484 | err = setup_rt_frame(ka, regs, signr, oldset, |
486 | (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL); | 485 | (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL); |
487 | if (err) | 486 | if (err) |
488 | return err; | 487 | return err; |
489 | sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); | ||
490 | if (!(ka->sa.sa_flags & SA_NOMASK)) | ||
491 | sigaddset(&blocked, signr); | ||
492 | set_current_blocked(&blocked); | ||
493 | 488 | ||
489 | block_sigmask(ka, signr); | ||
494 | tracehook_signal_handler(signr, info, ka, regs, 0); | 490 | tracehook_signal_handler(signr, info, ka, regs, 0); |
495 | 491 | ||
496 | return 0; | 492 | return 0; |
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..ef59642ff1bf 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
195 | { | 195 | { |
196 | struct vm_area_struct *vma; | 196 | struct vm_area_struct *vma; |
197 | struct mm_struct *mm = current->mm; | 197 | struct mm_struct *mm = current->mm; |
198 | unsigned long addr = addr0; | 198 | unsigned long addr = addr0, start_addr; |
199 | 199 | ||
200 | /* requested length too big for entire address space */ | 200 | /* requested length too big for entire address space */ |
201 | if (len > TASK_SIZE) | 201 | if (len > TASK_SIZE) |
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
223 | mm->free_area_cache = mm->mmap_base; | 223 | mm->free_area_cache = mm->mmap_base; |
224 | } | 224 | } |
225 | 225 | ||
226 | try_again: | ||
226 | /* either no address requested or can't fit in requested address hole */ | 227 | /* either no address requested or can't fit in requested address hole */ |
227 | addr = mm->free_area_cache; | 228 | start_addr = addr = mm->free_area_cache; |
228 | |||
229 | /* make sure it can fit in the remaining address space */ | ||
230 | if (addr > len) { | ||
231 | unsigned long tmp_addr = align_addr(addr - len, filp, | ||
232 | ALIGN_TOPDOWN); | ||
233 | |||
234 | vma = find_vma(mm, tmp_addr); | ||
235 | if (!vma || tmp_addr + len <= vma->vm_start) | ||
236 | /* remember the address as a hint for next time */ | ||
237 | return mm->free_area_cache = tmp_addr; | ||
238 | } | ||
239 | |||
240 | if (mm->mmap_base < len) | ||
241 | goto bottomup; | ||
242 | 229 | ||
243 | addr = mm->mmap_base-len; | 230 | if (addr < len) |
231 | goto fail; | ||
244 | 232 | ||
233 | addr -= len; | ||
245 | do { | 234 | do { |
246 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); | 235 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); |
247 | 236 | ||
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
263 | addr = vma->vm_start-len; | 252 | addr = vma->vm_start-len; |
264 | } while (len < vma->vm_start); | 253 | } while (len < vma->vm_start); |
265 | 254 | ||
255 | fail: | ||
256 | /* | ||
257 | * if hint left us with no space for the requested | ||
258 | * mapping then try again: | ||
259 | */ | ||
260 | if (start_addr != mm->mmap_base) { | ||
261 | mm->free_area_cache = mm->mmap_base; | ||
262 | mm->cached_hole_size = 0; | ||
263 | goto try_again; | ||
264 | } | ||
265 | |||
266 | bottomup: | 266 | bottomup: |
267 | /* | 267 | /* |
268 | * A failed mmap() very likely causes application failure, | 268 | * A failed mmap() very likely causes application failure, |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..328cb37bb827 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
172 | spinlock_t *ptl; | 172 | spinlock_t *ptl; |
173 | int i; | 173 | int i; |
174 | 174 | ||
175 | down_write(&mm->mmap_sem); | ||
175 | pgd = pgd_offset(mm, 0xA0000); | 176 | pgd = pgd_offset(mm, 0xA0000); |
176 | if (pgd_none_or_clear_bad(pgd)) | 177 | if (pgd_none_or_clear_bad(pgd)) |
177 | goto out; | 178 | goto out; |
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
190 | } | 191 | } |
191 | pte_unmap_unlock(pte, ptl); | 192 | pte_unmap_unlock(pte, ptl); |
192 | out: | 193 | out: |
194 | up_write(&mm->mmap_sem); | ||
193 | flush_tlb(); | 195 | flush_tlb(); |
194 | } | 196 | } |
195 | 197 | ||
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8ecbb4bba4b3..f6679a7fb8ca 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -308,10 +308,11 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |||
308 | { | 308 | { |
309 | struct hstate *h = hstate_file(file); | 309 | struct hstate *h = hstate_file(file); |
310 | struct mm_struct *mm = current->mm; | 310 | struct mm_struct *mm = current->mm; |
311 | struct vm_area_struct *vma, *prev_vma; | 311 | struct vm_area_struct *vma; |
312 | unsigned long base = mm->mmap_base, addr = addr0; | 312 | unsigned long base = mm->mmap_base; |
313 | unsigned long addr = addr0; | ||
313 | unsigned long largest_hole = mm->cached_hole_size; | 314 | unsigned long largest_hole = mm->cached_hole_size; |
314 | int first_time = 1; | 315 | unsigned long start_addr; |
315 | 316 | ||
316 | /* don't allow allocations above current base */ | 317 | /* don't allow allocations above current base */ |
317 | if (mm->free_area_cache > base) | 318 | if (mm->free_area_cache > base) |
@@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |||
322 | mm->free_area_cache = base; | 323 | mm->free_area_cache = base; |
323 | } | 324 | } |
324 | try_again: | 325 | try_again: |
326 | start_addr = mm->free_area_cache; | ||
327 | |||
325 | /* make sure it can fit in the remaining address space */ | 328 | /* make sure it can fit in the remaining address space */ |
326 | if (mm->free_area_cache < len) | 329 | if (mm->free_area_cache < len) |
327 | goto fail; | 330 | goto fail; |
@@ -337,22 +340,14 @@ try_again: | |||
337 | if (!vma) | 340 | if (!vma) |
338 | return addr; | 341 | return addr; |
339 | 342 | ||
340 | /* | 343 | if (addr + len <= vma->vm_start) { |
341 | * new region fits between prev_vma->vm_end and | ||
342 | * vma->vm_start, use it: | ||
343 | */ | ||
344 | prev_vma = vma->vm_prev; | ||
345 | if (addr + len <= vma->vm_start && | ||
346 | (!prev_vma || (addr >= prev_vma->vm_end))) { | ||
347 | /* remember the address as a hint for next time */ | 344 | /* remember the address as a hint for next time */ |
348 | mm->cached_hole_size = largest_hole; | 345 | mm->cached_hole_size = largest_hole; |
349 | return (mm->free_area_cache = addr); | 346 | return (mm->free_area_cache = addr); |
350 | } else { | 347 | } else if (mm->free_area_cache == vma->vm_end) { |
351 | /* pull free_area_cache down to the first hole */ | 348 | /* pull free_area_cache down to the first hole */ |
352 | if (mm->free_area_cache == vma->vm_end) { | 349 | mm->free_area_cache = vma->vm_start; |
353 | mm->free_area_cache = vma->vm_start; | 350 | mm->cached_hole_size = largest_hole; |
354 | mm->cached_hole_size = largest_hole; | ||
355 | } | ||
356 | } | 351 | } |
357 | 352 | ||
358 | /* remember the largest hole we saw so far */ | 353 | /* remember the largest hole we saw so far */ |
@@ -368,10 +363,9 @@ fail: | |||
368 | * if hint left us with no space for the requested | 363 | * if hint left us with no space for the requested |
369 | * mapping then try again: | 364 | * mapping then try again: |
370 | */ | 365 | */ |
371 | if (first_time) { | 366 | if (start_addr != base) { |
372 | mm->free_area_cache = base; | 367 | mm->free_area_cache = base; |
373 | largest_hole = 0; | 368 | largest_hole = 0; |
374 | first_time = 0; | ||
375 | goto try_again; | 369 | goto try_again; |
376 | } | 370 | } |
377 | /* | 371 | /* |
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 46db56845f18..740b0a355431 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c | |||
@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, | |||
60 | eb->nid = nid; | 60 | eb->nid = nid; |
61 | 61 | ||
62 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | 62 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) |
63 | emu_nid_to_phys[nid] = pb->nid; | 63 | emu_nid_to_phys[nid] = nid; |
64 | 64 | ||
65 | pb->start += size; | 65 | pb->start += size; |
66 | if (pb->start >= pb->end) { | 66 | if (pb->start >= pb->end) { |
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index f2220b5bdce6..b69b000349fc 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c | |||
@@ -260,10 +260,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, | |||
260 | goto badframe; | 260 | goto badframe; |
261 | 261 | ||
262 | sigdelsetmask(&set, ~_BLOCKABLE); | 262 | sigdelsetmask(&set, ~_BLOCKABLE); |
263 | spin_lock_irq(&current->sighand->siglock); | 263 | set_current_blocked(&set); |
264 | current->blocked = set; | ||
265 | recalc_sigpending(); | ||
266 | spin_unlock_irq(&current->sighand->siglock); | ||
267 | 264 | ||
268 | if (restore_sigcontext(regs, frame)) | 265 | if (restore_sigcontext(regs, frame)) |
269 | goto badframe; | 266 | goto badframe; |
@@ -336,8 +333,8 @@ gen_return_code(unsigned char *codemem) | |||
336 | } | 333 | } |
337 | 334 | ||
338 | 335 | ||
339 | static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | 336 | static int setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info, |
340 | sigset_t *set, struct pt_regs *regs) | 337 | sigset_t *set, struct pt_regs *regs) |
341 | { | 338 | { |
342 | struct rt_sigframe *frame; | 339 | struct rt_sigframe *frame; |
343 | int err = 0; | 340 | int err = 0; |
@@ -422,12 +419,11 @@ static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
422 | current->comm, current->pid, signal, frame, regs->pc); | 419 | current->comm, current->pid, signal, frame, regs->pc); |
423 | #endif | 420 | #endif |
424 | 421 | ||
425 | return; | 422 | return 0; |
426 | 423 | ||
427 | give_sigsegv: | 424 | give_sigsegv: |
428 | if (sig == SIGSEGV) | 425 | force_sigsegv(sig, current); |
429 | ka->sa.sa_handler = SIG_DFL; | 426 | return -EFAULT; |
430 | force_sig(SIGSEGV, current); | ||
431 | } | 427 | } |
432 | 428 | ||
433 | /* | 429 | /* |
@@ -449,11 +445,8 @@ asmlinkage long xtensa_rt_sigsuspend(sigset_t __user *unewset, | |||
449 | return -EFAULT; | 445 | return -EFAULT; |
450 | 446 | ||
451 | sigdelsetmask(&newset, ~_BLOCKABLE); | 447 | sigdelsetmask(&newset, ~_BLOCKABLE); |
452 | spin_lock_irq(&current->sighand->siglock); | ||
453 | saveset = current->blocked; | 448 | saveset = current->blocked; |
454 | current->blocked = newset; | 449 | set_current_blocked(&newset); |
455 | recalc_sigpending(); | ||
456 | spin_unlock_irq(&current->sighand->siglock); | ||
457 | 450 | ||
458 | regs->areg[2] = -EINTR; | 451 | regs->areg[2] = -EINTR; |
459 | while (1) { | 452 | while (1) { |
@@ -536,17 +529,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) | |||
536 | 529 | ||
537 | /* Whee! Actually deliver the signal. */ | 530 | /* Whee! Actually deliver the signal. */ |
538 | /* Set up the stack frame */ | 531 | /* Set up the stack frame */ |
539 | setup_frame(signr, &ka, &info, oldset, regs); | 532 | ret = setup_frame(signr, &ka, &info, oldset, regs); |
540 | 533 | if (ret) | |
541 | if (ka.sa.sa_flags & SA_ONESHOT) | 534 | return ret; |
542 | ka.sa.sa_handler = SIG_DFL; | ||
543 | 535 | ||
544 | spin_lock_irq(&current->sighand->siglock); | 536 | block_sigmask(&ka, signr); |
545 | sigorsets(&current->blocked, &current->blocked, &ka.sa.sa_mask); | ||
546 | if (!(ka.sa.sa_flags & SA_NODEFER)) | ||
547 | sigaddset(&current->blocked, signr); | ||
548 | recalc_sigpending(); | ||
549 | spin_unlock_irq(&current->sighand->siglock); | ||
550 | if (current->ptrace & PT_SINGLESTEP) | 537 | if (current->ptrace & PT_SINGLESTEP) |
551 | task_pt_regs(current)->icountlevel = 1; | 538 | task_pt_regs(current)->icountlevel = 1; |
552 | 539 | ||
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 1c15e9b33575..d0f59c3f87ef 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c | |||
@@ -507,8 +507,7 @@ int intel_idle_cpu_init(int cpu) | |||
507 | int num_substates; | 507 | int num_substates; |
508 | 508 | ||
509 | if (cstate > max_cstate) { | 509 | if (cstate > max_cstate) { |
510 | printk(PREFIX "max_cstate %d reached\n", | 510 | printk(PREFIX "max_cstate %d reached\n", max_cstate); |
511 | max_cstate); | ||
512 | break; | 511 | break; |
513 | } | 512 | } |
514 | 513 | ||
@@ -524,8 +523,9 @@ int intel_idle_cpu_init(int cpu) | |||
524 | dev->states_usage[dev->state_count].driver_data = | 523 | dev->states_usage[dev->state_count].driver_data = |
525 | (void *)get_driver_data(cstate); | 524 | (void *)get_driver_data(cstate); |
526 | 525 | ||
527 | dev->state_count += 1; | 526 | dev->state_count += 1; |
528 | } | 527 | } |
528 | |||
529 | dev->cpu = cpu; | 529 | dev->cpu = cpu; |
530 | 530 | ||
531 | if (cpuidle_register_device(dev)) { | 531 | if (cpuidle_register_device(dev)) { |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ecb8e2203ac8..136e86faa1e1 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
@@ -346,7 +346,7 @@ static struct sysrq_key_op sysrq_term_op = { | |||
346 | 346 | ||
347 | static void moom_callback(struct work_struct *ignored) | 347 | static void moom_callback(struct work_struct *ignored) |
348 | { | 348 | { |
349 | out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL); | 349 | out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL, true); |
350 | } | 350 | } |
351 | 351 | ||
352 | static DECLARE_WORK(moom_work, moom_callback); | 352 | static DECLARE_WORK(moom_work, moom_callback); |
@@ -822,7 +822,7 @@ static int exec_mmap(struct mm_struct *mm) | |||
822 | /* Notify parent that we're no longer interested in the old VM */ | 822 | /* Notify parent that we're no longer interested in the old VM */ |
823 | tsk = current; | 823 | tsk = current; |
824 | old_mm = current->mm; | 824 | old_mm = current->mm; |
825 | sync_mm_rss(tsk, old_mm); | 825 | sync_mm_rss(old_mm); |
826 | mm_release(tsk, old_mm); | 826 | mm_release(tsk, old_mm); |
827 | 827 | ||
828 | if (old_mm) { | 828 | if (old_mm) { |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 81932fa1861a..ea251749d9d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations; | |||
41 | static const struct inode_operations hugetlbfs_dir_inode_operations; | 41 | static const struct inode_operations hugetlbfs_dir_inode_operations; |
42 | static const struct inode_operations hugetlbfs_inode_operations; | 42 | static const struct inode_operations hugetlbfs_inode_operations; |
43 | 43 | ||
44 | struct hugetlbfs_config { | ||
45 | uid_t uid; | ||
46 | gid_t gid; | ||
47 | umode_t mode; | ||
48 | long nr_blocks; | ||
49 | long nr_inodes; | ||
50 | struct hstate *hstate; | ||
51 | }; | ||
52 | |||
53 | struct hugetlbfs_inode_info { | ||
54 | struct shared_policy policy; | ||
55 | struct inode vfs_inode; | ||
56 | }; | ||
57 | |||
58 | static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) | ||
59 | { | ||
60 | return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); | ||
61 | } | ||
62 | |||
44 | static struct backing_dev_info hugetlbfs_backing_dev_info = { | 63 | static struct backing_dev_info hugetlbfs_backing_dev_info = { |
45 | .name = "hugetlbfs", | 64 | .name = "hugetlbfs", |
46 | .ra_pages = 0, /* No readahead */ | 65 | .ra_pages = 0, /* No readahead */ |
@@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
154 | return addr; | 173 | return addr; |
155 | } | 174 | } |
156 | 175 | ||
157 | start_addr = mm->free_area_cache; | 176 | if (len > mm->cached_hole_size) |
158 | 177 | start_addr = mm->free_area_cache; | |
159 | if (len <= mm->cached_hole_size) | 178 | else { |
160 | start_addr = TASK_UNMAPPED_BASE; | 179 | start_addr = TASK_UNMAPPED_BASE; |
180 | mm->cached_hole_size = 0; | ||
181 | } | ||
161 | 182 | ||
162 | full_search: | 183 | full_search: |
163 | addr = ALIGN(start_addr, huge_page_size(h)); | 184 | addr = ALIGN(start_addr, huge_page_size(h)); |
@@ -171,13 +192,18 @@ full_search: | |||
171 | */ | 192 | */ |
172 | if (start_addr != TASK_UNMAPPED_BASE) { | 193 | if (start_addr != TASK_UNMAPPED_BASE) { |
173 | start_addr = TASK_UNMAPPED_BASE; | 194 | start_addr = TASK_UNMAPPED_BASE; |
195 | mm->cached_hole_size = 0; | ||
174 | goto full_search; | 196 | goto full_search; |
175 | } | 197 | } |
176 | return -ENOMEM; | 198 | return -ENOMEM; |
177 | } | 199 | } |
178 | 200 | ||
179 | if (!vma || addr + len <= vma->vm_start) | 201 | if (!vma || addr + len <= vma->vm_start) { |
202 | mm->free_area_cache = addr + len; | ||
180 | return addr; | 203 | return addr; |
204 | } | ||
205 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
206 | mm->cached_hole_size = vma->vm_start - addr; | ||
181 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | 207 | addr = ALIGN(vma->vm_end, huge_page_size(h)); |
182 | } | 208 | } |
183 | } | 209 | } |
@@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
238 | loff_t isize; | 264 | loff_t isize; |
239 | ssize_t retval = 0; | 265 | ssize_t retval = 0; |
240 | 266 | ||
241 | mutex_lock(&inode->i_mutex); | ||
242 | |||
243 | /* validate length */ | 267 | /* validate length */ |
244 | if (len == 0) | 268 | if (len == 0) |
245 | goto out; | 269 | goto out; |
246 | 270 | ||
247 | isize = i_size_read(inode); | ||
248 | if (!isize) | ||
249 | goto out; | ||
250 | |||
251 | end_index = (isize - 1) >> huge_page_shift(h); | ||
252 | for (;;) { | 271 | for (;;) { |
253 | struct page *page; | 272 | struct page *page; |
254 | unsigned long nr, ret; | 273 | unsigned long nr, ret; |
@@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
256 | 275 | ||
257 | /* nr is the maximum number of bytes to copy from this page */ | 276 | /* nr is the maximum number of bytes to copy from this page */ |
258 | nr = huge_page_size(h); | 277 | nr = huge_page_size(h); |
278 | isize = i_size_read(inode); | ||
279 | if (!isize) | ||
280 | goto out; | ||
281 | end_index = (isize - 1) >> huge_page_shift(h); | ||
259 | if (index >= end_index) { | 282 | if (index >= end_index) { |
260 | if (index > end_index) | 283 | if (index > end_index) |
261 | goto out; | 284 | goto out; |
262 | nr = ((isize - 1) & ~huge_page_mask(h)) + 1; | 285 | nr = ((isize - 1) & ~huge_page_mask(h)) + 1; |
263 | if (nr <= offset) { | 286 | if (nr <= offset) |
264 | goto out; | 287 | goto out; |
265 | } | ||
266 | } | 288 | } |
267 | nr = nr - offset; | 289 | nr = nr - offset; |
268 | 290 | ||
269 | /* Find the page */ | 291 | /* Find the page */ |
270 | page = find_get_page(mapping, index); | 292 | page = find_lock_page(mapping, index); |
271 | if (unlikely(page == NULL)) { | 293 | if (unlikely(page == NULL)) { |
272 | /* | 294 | /* |
273 | * We have a HOLE, zero out the user-buffer for the | 295 | * We have a HOLE, zero out the user-buffer for the |
@@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
279 | else | 301 | else |
280 | ra = 0; | 302 | ra = 0; |
281 | } else { | 303 | } else { |
304 | unlock_page(page); | ||
305 | |||
282 | /* | 306 | /* |
283 | * We have the page, copy it to user space buffer. | 307 | * We have the page, copy it to user space buffer. |
284 | */ | 308 | */ |
285 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); | 309 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); |
286 | ret = ra; | 310 | ret = ra; |
311 | page_cache_release(page); | ||
287 | } | 312 | } |
288 | if (ra < 0) { | 313 | if (ra < 0) { |
289 | if (retval == 0) | 314 | if (retval == 0) |
290 | retval = ra; | 315 | retval = ra; |
291 | if (page) | ||
292 | page_cache_release(page); | ||
293 | goto out; | 316 | goto out; |
294 | } | 317 | } |
295 | 318 | ||
@@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
299 | index += offset >> huge_page_shift(h); | 322 | index += offset >> huge_page_shift(h); |
300 | offset &= ~huge_page_mask(h); | 323 | offset &= ~huge_page_mask(h); |
301 | 324 | ||
302 | if (page) | ||
303 | page_cache_release(page); | ||
304 | |||
305 | /* short read or no more work */ | 325 | /* short read or no more work */ |
306 | if ((ret != nr) || (len == 0)) | 326 | if ((ret != nr) || (len == 0)) |
307 | break; | 327 | break; |
308 | } | 328 | } |
309 | out: | 329 | out: |
310 | *ppos = ((loff_t)index << huge_page_shift(h)) + offset; | 330 | *ppos = ((loff_t)index << huge_page_shift(h)) + offset; |
311 | mutex_unlock(&inode->i_mutex); | ||
312 | return retval; | 331 | return retval; |
313 | } | 332 | } |
314 | 333 | ||
@@ -607,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
607 | spin_lock(&sbinfo->stat_lock); | 626 | spin_lock(&sbinfo->stat_lock); |
608 | /* If no limits set, just report 0 for max/free/used | 627 | /* If no limits set, just report 0 for max/free/used |
609 | * blocks, like simple_statfs() */ | 628 | * blocks, like simple_statfs() */ |
610 | if (sbinfo->max_blocks >= 0) { | 629 | if (sbinfo->spool) { |
611 | buf->f_blocks = sbinfo->max_blocks; | 630 | long free_pages; |
612 | buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; | 631 | |
632 | spin_lock(&sbinfo->spool->lock); | ||
633 | buf->f_blocks = sbinfo->spool->max_hpages; | ||
634 | free_pages = sbinfo->spool->max_hpages | ||
635 | - sbinfo->spool->used_hpages; | ||
636 | buf->f_bavail = buf->f_bfree = free_pages; | ||
637 | spin_unlock(&sbinfo->spool->lock); | ||
613 | buf->f_files = sbinfo->max_inodes; | 638 | buf->f_files = sbinfo->max_inodes; |
614 | buf->f_ffree = sbinfo->free_inodes; | 639 | buf->f_ffree = sbinfo->free_inodes; |
615 | } | 640 | } |
@@ -625,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb) | |||
625 | 650 | ||
626 | if (sbi) { | 651 | if (sbi) { |
627 | sb->s_fs_info = NULL; | 652 | sb->s_fs_info = NULL; |
653 | |||
654 | if (sbi->spool) | ||
655 | hugepage_put_subpool(sbi->spool); | ||
656 | |||
628 | kfree(sbi); | 657 | kfree(sbi); |
629 | } | 658 | } |
630 | } | 659 | } |
@@ -853,10 +882,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) | |||
853 | sb->s_fs_info = sbinfo; | 882 | sb->s_fs_info = sbinfo; |
854 | sbinfo->hstate = config.hstate; | 883 | sbinfo->hstate = config.hstate; |
855 | spin_lock_init(&sbinfo->stat_lock); | 884 | spin_lock_init(&sbinfo->stat_lock); |
856 | sbinfo->max_blocks = config.nr_blocks; | ||
857 | sbinfo->free_blocks = config.nr_blocks; | ||
858 | sbinfo->max_inodes = config.nr_inodes; | 885 | sbinfo->max_inodes = config.nr_inodes; |
859 | sbinfo->free_inodes = config.nr_inodes; | 886 | sbinfo->free_inodes = config.nr_inodes; |
887 | sbinfo->spool = NULL; | ||
888 | if (config.nr_blocks != -1) { | ||
889 | sbinfo->spool = hugepage_new_subpool(config.nr_blocks); | ||
890 | if (!sbinfo->spool) | ||
891 | goto out_free; | ||
892 | } | ||
860 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 893 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
861 | sb->s_blocksize = huge_page_size(config.hstate); | 894 | sb->s_blocksize = huge_page_size(config.hstate); |
862 | sb->s_blocksize_bits = huge_page_shift(config.hstate); | 895 | sb->s_blocksize_bits = huge_page_shift(config.hstate); |
@@ -868,38 +901,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) | |||
868 | goto out_free; | 901 | goto out_free; |
869 | return 0; | 902 | return 0; |
870 | out_free: | 903 | out_free: |
904 | if (sbinfo->spool) | ||
905 | kfree(sbinfo->spool); | ||
871 | kfree(sbinfo); | 906 | kfree(sbinfo); |
872 | return -ENOMEM; | 907 | return -ENOMEM; |
873 | } | 908 | } |
874 | 909 | ||
875 | int hugetlb_get_quota(struct address_space *mapping, long delta) | ||
876 | { | ||
877 | int ret = 0; | ||
878 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); | ||
879 | |||
880 | if (sbinfo->free_blocks > -1) { | ||
881 | spin_lock(&sbinfo->stat_lock); | ||
882 | if (sbinfo->free_blocks - delta >= 0) | ||
883 | sbinfo->free_blocks -= delta; | ||
884 | else | ||
885 | ret = -ENOMEM; | ||
886 | spin_unlock(&sbinfo->stat_lock); | ||
887 | } | ||
888 | |||
889 | return ret; | ||
890 | } | ||
891 | |||
892 | void hugetlb_put_quota(struct address_space *mapping, long delta) | ||
893 | { | ||
894 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); | ||
895 | |||
896 | if (sbinfo->free_blocks > -1) { | ||
897 | spin_lock(&sbinfo->stat_lock); | ||
898 | sbinfo->free_blocks += delta; | ||
899 | spin_unlock(&sbinfo->stat_lock); | ||
900 | } | ||
901 | } | ||
902 | |||
903 | static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, | 910 | static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, |
904 | int flags, const char *dev_name, void *data) | 911 | int flags, const char *dev_name, void *data) |
905 | { | 912 | { |
@@ -919,8 +926,8 @@ static int can_do_hugetlb_shm(void) | |||
919 | return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); | 926 | return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); |
920 | } | 927 | } |
921 | 928 | ||
922 | struct file *hugetlb_file_setup(const char *name, size_t size, | 929 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, |
923 | vm_flags_t acctflag, | 930 | size_t size, vm_flags_t acctflag, |
924 | struct user_struct **user, int creat_flags) | 931 | struct user_struct **user, int creat_flags) |
925 | { | 932 | { |
926 | int error = -ENOMEM; | 933 | int error = -ENOMEM; |
@@ -929,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, | |||
929 | struct path path; | 936 | struct path path; |
930 | struct dentry *root; | 937 | struct dentry *root; |
931 | struct qstr quick_string; | 938 | struct qstr quick_string; |
939 | struct hstate *hstate; | ||
940 | unsigned long num_pages; | ||
932 | 941 | ||
933 | *user = NULL; | 942 | *user = NULL; |
934 | if (!hugetlbfs_vfsmount) | 943 | if (!hugetlbfs_vfsmount) |
@@ -937,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, | |||
937 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { | 946 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { |
938 | *user = current_user(); | 947 | *user = current_user(); |
939 | if (user_shm_lock(size, *user)) { | 948 | if (user_shm_lock(size, *user)) { |
940 | printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); | 949 | task_lock(current); |
950 | printk_once(KERN_WARNING | ||
951 | "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", | ||
952 | current->comm, current->pid); | ||
953 | task_unlock(current); | ||
941 | } else { | 954 | } else { |
942 | *user = NULL; | 955 | *user = NULL; |
943 | return ERR_PTR(-EPERM); | 956 | return ERR_PTR(-EPERM); |
@@ -958,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size, | |||
958 | if (!inode) | 971 | if (!inode) |
959 | goto out_dentry; | 972 | goto out_dentry; |
960 | 973 | ||
974 | hstate = hstate_inode(inode); | ||
975 | size += addr & ~huge_page_mask(hstate); | ||
976 | num_pages = ALIGN(size, huge_page_size(hstate)) >> | ||
977 | huge_page_shift(hstate); | ||
961 | error = -ENOMEM; | 978 | error = -ENOMEM; |
962 | if (hugetlb_reserve_pages(inode, 0, | 979 | if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) |
963 | size >> huge_page_shift(hstate_inode(inode)), NULL, | ||
964 | acctflag)) | ||
965 | goto out_inode; | 980 | goto out_inode; |
966 | 981 | ||
967 | d_instantiate(path.dentry, inode); | 982 | d_instantiate(path.dentry, inode); |
@@ -997,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void) | |||
997 | if (error) | 1012 | if (error) |
998 | return error; | 1013 | return error; |
999 | 1014 | ||
1015 | error = -ENOMEM; | ||
1000 | hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", | 1016 | hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", |
1001 | sizeof(struct hugetlbfs_inode_info), | 1017 | sizeof(struct hugetlbfs_inode_info), |
1002 | 0, 0, init_once); | 1018 | 0, 0, init_once); |
@@ -1015,10 +1031,10 @@ static int __init init_hugetlbfs_fs(void) | |||
1015 | } | 1031 | } |
1016 | 1032 | ||
1017 | error = PTR_ERR(vfsmount); | 1033 | error = PTR_ERR(vfsmount); |
1034 | unregister_filesystem(&hugetlbfs_fs_type); | ||
1018 | 1035 | ||
1019 | out: | 1036 | out: |
1020 | if (error) | 1037 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
1021 | kmem_cache_destroy(hugetlbfs_inode_cachep); | ||
1022 | out2: | 1038 | out2: |
1023 | bdi_destroy(&hugetlbfs_backing_dev_info); | 1039 | bdi_destroy(&hugetlbfs_backing_dev_info); |
1024 | return error; | 1040 | return error; |
diff --git a/fs/namei.c b/fs/namei.c index 13e6a1f191a9..a94a7f9a03ea 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -1455,9 +1455,15 @@ done: | |||
1455 | } | 1455 | } |
1456 | EXPORT_SYMBOL(full_name_hash); | 1456 | EXPORT_SYMBOL(full_name_hash); |
1457 | 1457 | ||
1458 | #ifdef CONFIG_64BIT | ||
1458 | #define ONEBYTES 0x0101010101010101ul | 1459 | #define ONEBYTES 0x0101010101010101ul |
1459 | #define SLASHBYTES 0x2f2f2f2f2f2f2f2ful | 1460 | #define SLASHBYTES 0x2f2f2f2f2f2f2f2ful |
1460 | #define HIGHBITS 0x8080808080808080ul | 1461 | #define HIGHBITS 0x8080808080808080ul |
1462 | #else | ||
1463 | #define ONEBYTES 0x01010101ul | ||
1464 | #define SLASHBYTES 0x2f2f2f2ful | ||
1465 | #define HIGHBITS 0x80808080ul | ||
1466 | #endif | ||
1461 | 1467 | ||
1462 | /* Return the high bit set in the first byte that is a zero */ | 1468 | /* Return the high bit set in the first byte that is a zero */ |
1463 | static inline unsigned long has_zero(unsigned long a) | 1469 | static inline unsigned long has_zero(unsigned long a) |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 965d4bde3a3b..3b42c1418f31 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2989,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2989 | INF("cmdline", S_IRUGO, proc_pid_cmdline), | 2989 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
2990 | ONE("stat", S_IRUGO, proc_tgid_stat), | 2990 | ONE("stat", S_IRUGO, proc_tgid_stat), |
2991 | ONE("statm", S_IRUGO, proc_pid_statm), | 2991 | ONE("statm", S_IRUGO, proc_pid_statm), |
2992 | REG("maps", S_IRUGO, proc_maps_operations), | 2992 | REG("maps", S_IRUGO, proc_pid_maps_operations), |
2993 | #ifdef CONFIG_NUMA | 2993 | #ifdef CONFIG_NUMA |
2994 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), | 2994 | REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), |
2995 | #endif | 2995 | #endif |
2996 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), | 2996 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
2997 | LNK("cwd", proc_cwd_link), | 2997 | LNK("cwd", proc_cwd_link), |
@@ -3002,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
3002 | REG("mountstats", S_IRUSR, proc_mountstats_operations), | 3002 | REG("mountstats", S_IRUSR, proc_mountstats_operations), |
3003 | #ifdef CONFIG_PROC_PAGE_MONITOR | 3003 | #ifdef CONFIG_PROC_PAGE_MONITOR |
3004 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), | 3004 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
3005 | REG("smaps", S_IRUGO, proc_smaps_operations), | 3005 | REG("smaps", S_IRUGO, proc_pid_smaps_operations), |
3006 | REG("pagemap", S_IRUGO, proc_pagemap_operations), | 3006 | REG("pagemap", S_IRUGO, proc_pagemap_operations), |
3007 | #endif | 3007 | #endif |
3008 | #ifdef CONFIG_SECURITY | 3008 | #ifdef CONFIG_SECURITY |
@@ -3348,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3348 | INF("cmdline", S_IRUGO, proc_pid_cmdline), | 3348 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
3349 | ONE("stat", S_IRUGO, proc_tid_stat), | 3349 | ONE("stat", S_IRUGO, proc_tid_stat), |
3350 | ONE("statm", S_IRUGO, proc_pid_statm), | 3350 | ONE("statm", S_IRUGO, proc_pid_statm), |
3351 | REG("maps", S_IRUGO, proc_maps_operations), | 3351 | REG("maps", S_IRUGO, proc_tid_maps_operations), |
3352 | #ifdef CONFIG_NUMA | 3352 | #ifdef CONFIG_NUMA |
3353 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), | 3353 | REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), |
3354 | #endif | 3354 | #endif |
3355 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), | 3355 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
3356 | LNK("cwd", proc_cwd_link), | 3356 | LNK("cwd", proc_cwd_link), |
@@ -3360,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3360 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), | 3360 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
3361 | #ifdef CONFIG_PROC_PAGE_MONITOR | 3361 | #ifdef CONFIG_PROC_PAGE_MONITOR |
3362 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), | 3362 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
3363 | REG("smaps", S_IRUGO, proc_smaps_operations), | 3363 | REG("smaps", S_IRUGO, proc_tid_smaps_operations), |
3364 | REG("pagemap", S_IRUGO, proc_pagemap_operations), | 3364 | REG("pagemap", S_IRUGO, proc_pagemap_operations), |
3365 | #endif | 3365 | #endif |
3366 | #ifdef CONFIG_SECURITY | 3366 | #ifdef CONFIG_SECURITY |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad1..c44efe19798f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -53,9 +53,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, | |||
53 | struct pid *pid, struct task_struct *task); | 53 | struct pid *pid, struct task_struct *task); |
54 | extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); | 54 | extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); |
55 | 55 | ||
56 | extern const struct file_operations proc_maps_operations; | 56 | extern const struct file_operations proc_pid_maps_operations; |
57 | extern const struct file_operations proc_numa_maps_operations; | 57 | extern const struct file_operations proc_tid_maps_operations; |
58 | extern const struct file_operations proc_smaps_operations; | 58 | extern const struct file_operations proc_pid_numa_maps_operations; |
59 | extern const struct file_operations proc_tid_numa_maps_operations; | ||
60 | extern const struct file_operations proc_pid_smaps_operations; | ||
61 | extern const struct file_operations proc_tid_smaps_operations; | ||
59 | extern const struct file_operations proc_clear_refs_operations; | 62 | extern const struct file_operations proc_clear_refs_operations; |
60 | extern const struct file_operations proc_pagemap_operations; | 63 | extern const struct file_operations proc_pagemap_operations; |
61 | extern const struct file_operations proc_net_operations; | 64 | extern const struct file_operations proc_net_operations; |
diff --git a/fs/proc/page.c b/fs/proc/page.c index 6d8e6a9e93ab..7fcd0d60a968 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page) | |||
115 | u |= 1 << KPF_COMPOUND_TAIL; | 115 | u |= 1 << KPF_COMPOUND_TAIL; |
116 | if (PageHuge(page)) | 116 | if (PageHuge(page)) |
117 | u |= 1 << KPF_HUGE; | 117 | u |= 1 << KPF_HUGE; |
118 | else if (PageTransCompound(page)) | ||
119 | u |= 1 << KPF_THP; | ||
118 | 120 | ||
119 | /* | 121 | /* |
120 | * Caveats on high order pages: page->_count will only be set | 122 | * Caveats on high order pages: page->_count will only be set |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7dcd2a250495..9694cc283511 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file, | |||
209 | return ret; | 209 | return ret; |
210 | } | 210 | } |
211 | 211 | ||
212 | static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) | 212 | static void |
213 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | ||
213 | { | 214 | { |
214 | struct mm_struct *mm = vma->vm_mm; | 215 | struct mm_struct *mm = vma->vm_mm; |
215 | struct file *file = vma->vm_file; | 216 | struct file *file = vma->vm_file; |
217 | struct proc_maps_private *priv = m->private; | ||
218 | struct task_struct *task = priv->task; | ||
216 | vm_flags_t flags = vma->vm_flags; | 219 | vm_flags_t flags = vma->vm_flags; |
217 | unsigned long ino = 0; | 220 | unsigned long ino = 0; |
218 | unsigned long long pgoff = 0; | 221 | unsigned long long pgoff = 0; |
219 | unsigned long start, end; | 222 | unsigned long start, end; |
220 | dev_t dev = 0; | 223 | dev_t dev = 0; |
221 | int len; | 224 | int len; |
225 | const char *name = NULL; | ||
222 | 226 | ||
223 | if (file) { | 227 | if (file) { |
224 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 228 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) | |||
252 | if (file) { | 256 | if (file) { |
253 | pad_len_spaces(m, len); | 257 | pad_len_spaces(m, len); |
254 | seq_path(m, &file->f_path, "\n"); | 258 | seq_path(m, &file->f_path, "\n"); |
255 | } else { | 259 | goto done; |
256 | const char *name = arch_vma_name(vma); | 260 | } |
257 | if (!name) { | 261 | |
258 | if (mm) { | 262 | name = arch_vma_name(vma); |
259 | if (vma->vm_start <= mm->brk && | 263 | if (!name) { |
260 | vma->vm_end >= mm->start_brk) { | 264 | pid_t tid; |
261 | name = "[heap]"; | 265 | |
262 | } else if (vma->vm_start <= mm->start_stack && | 266 | if (!mm) { |
263 | vma->vm_end >= mm->start_stack) { | 267 | name = "[vdso]"; |
264 | name = "[stack]"; | 268 | goto done; |
265 | } | 269 | } |
270 | |||
271 | if (vma->vm_start <= mm->brk && | ||
272 | vma->vm_end >= mm->start_brk) { | ||
273 | name = "[heap]"; | ||
274 | goto done; | ||
275 | } | ||
276 | |||
277 | tid = vm_is_stack(task, vma, is_pid); | ||
278 | |||
279 | if (tid != 0) { | ||
280 | /* | ||
281 | * Thread stack in /proc/PID/task/TID/maps or | ||
282 | * the main process stack. | ||
283 | */ | ||
284 | if (!is_pid || (vma->vm_start <= mm->start_stack && | ||
285 | vma->vm_end >= mm->start_stack)) { | ||
286 | name = "[stack]"; | ||
266 | } else { | 287 | } else { |
267 | name = "[vdso]"; | 288 | /* Thread stack in /proc/PID/maps */ |
289 | pad_len_spaces(m, len); | ||
290 | seq_printf(m, "[stack:%d]", tid); | ||
268 | } | 291 | } |
269 | } | 292 | } |
270 | if (name) { | 293 | } |
271 | pad_len_spaces(m, len); | 294 | |
272 | seq_puts(m, name); | 295 | done: |
273 | } | 296 | if (name) { |
297 | pad_len_spaces(m, len); | ||
298 | seq_puts(m, name); | ||
274 | } | 299 | } |
275 | seq_putc(m, '\n'); | 300 | seq_putc(m, '\n'); |
276 | } | 301 | } |
277 | 302 | ||
278 | static int show_map(struct seq_file *m, void *v) | 303 | static int show_map(struct seq_file *m, void *v, int is_pid) |
279 | { | 304 | { |
280 | struct vm_area_struct *vma = v; | 305 | struct vm_area_struct *vma = v; |
281 | struct proc_maps_private *priv = m->private; | 306 | struct proc_maps_private *priv = m->private; |
282 | struct task_struct *task = priv->task; | 307 | struct task_struct *task = priv->task; |
283 | 308 | ||
284 | show_map_vma(m, vma); | 309 | show_map_vma(m, vma, is_pid); |
285 | 310 | ||
286 | if (m->count < m->size) /* vma is copied successfully */ | 311 | if (m->count < m->size) /* vma is copied successfully */ |
287 | m->version = (vma != get_gate_vma(task->mm)) | 312 | m->version = (vma != get_gate_vma(task->mm)) |
@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v) | |||
289 | return 0; | 314 | return 0; |
290 | } | 315 | } |
291 | 316 | ||
317 | static int show_pid_map(struct seq_file *m, void *v) | ||
318 | { | ||
319 | return show_map(m, v, 1); | ||
320 | } | ||
321 | |||
322 | static int show_tid_map(struct seq_file *m, void *v) | ||
323 | { | ||
324 | return show_map(m, v, 0); | ||
325 | } | ||
326 | |||
292 | static const struct seq_operations proc_pid_maps_op = { | 327 | static const struct seq_operations proc_pid_maps_op = { |
293 | .start = m_start, | 328 | .start = m_start, |
294 | .next = m_next, | 329 | .next = m_next, |
295 | .stop = m_stop, | 330 | .stop = m_stop, |
296 | .show = show_map | 331 | .show = show_pid_map |
297 | }; | 332 | }; |
298 | 333 | ||
299 | static int maps_open(struct inode *inode, struct file *file) | 334 | static const struct seq_operations proc_tid_maps_op = { |
335 | .start = m_start, | ||
336 | .next = m_next, | ||
337 | .stop = m_stop, | ||
338 | .show = show_tid_map | ||
339 | }; | ||
340 | |||
341 | static int pid_maps_open(struct inode *inode, struct file *file) | ||
300 | { | 342 | { |
301 | return do_maps_open(inode, file, &proc_pid_maps_op); | 343 | return do_maps_open(inode, file, &proc_pid_maps_op); |
302 | } | 344 | } |
303 | 345 | ||
304 | const struct file_operations proc_maps_operations = { | 346 | static int tid_maps_open(struct inode *inode, struct file *file) |
305 | .open = maps_open, | 347 | { |
348 | return do_maps_open(inode, file, &proc_tid_maps_op); | ||
349 | } | ||
350 | |||
351 | const struct file_operations proc_pid_maps_operations = { | ||
352 | .open = pid_maps_open, | ||
353 | .read = seq_read, | ||
354 | .llseek = seq_lseek, | ||
355 | .release = seq_release_private, | ||
356 | }; | ||
357 | |||
358 | const struct file_operations proc_tid_maps_operations = { | ||
359 | .open = tid_maps_open, | ||
306 | .read = seq_read, | 360 | .read = seq_read, |
307 | .llseek = seq_lseek, | 361 | .llseek = seq_lseek, |
308 | .release = seq_release_private, | 362 | .release = seq_release_private, |
@@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
394 | pte_t *pte; | 448 | pte_t *pte; |
395 | spinlock_t *ptl; | 449 | spinlock_t *ptl; |
396 | 450 | ||
397 | spin_lock(&walk->mm->page_table_lock); | 451 | if (pmd_trans_huge_lock(pmd, vma) == 1) { |
398 | if (pmd_trans_huge(*pmd)) { | 452 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); |
399 | if (pmd_trans_splitting(*pmd)) { | ||
400 | spin_unlock(&walk->mm->page_table_lock); | ||
401 | wait_split_huge_page(vma->anon_vma, pmd); | ||
402 | } else { | ||
403 | smaps_pte_entry(*(pte_t *)pmd, addr, | ||
404 | HPAGE_PMD_SIZE, walk); | ||
405 | spin_unlock(&walk->mm->page_table_lock); | ||
406 | mss->anonymous_thp += HPAGE_PMD_SIZE; | ||
407 | return 0; | ||
408 | } | ||
409 | } else { | ||
410 | spin_unlock(&walk->mm->page_table_lock); | 453 | spin_unlock(&walk->mm->page_table_lock); |
454 | mss->anonymous_thp += HPAGE_PMD_SIZE; | ||
455 | return 0; | ||
411 | } | 456 | } |
457 | |||
458 | if (pmd_trans_unstable(pmd)) | ||
459 | return 0; | ||
412 | /* | 460 | /* |
413 | * The mmap_sem held all the way back in m_start() is what | 461 | * The mmap_sem held all the way back in m_start() is what |
414 | * keeps khugepaged out of here and from collapsing things | 462 | * keeps khugepaged out of here and from collapsing things |
@@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
422 | return 0; | 470 | return 0; |
423 | } | 471 | } |
424 | 472 | ||
425 | static int show_smap(struct seq_file *m, void *v) | 473 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
426 | { | 474 | { |
427 | struct proc_maps_private *priv = m->private; | 475 | struct proc_maps_private *priv = m->private; |
428 | struct task_struct *task = priv->task; | 476 | struct task_struct *task = priv->task; |
@@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v) | |||
440 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) | 488 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) |
441 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); | 489 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); |
442 | 490 | ||
443 | show_map_vma(m, vma); | 491 | show_map_vma(m, vma, is_pid); |
444 | 492 | ||
445 | seq_printf(m, | 493 | seq_printf(m, |
446 | "Size: %8lu kB\n" | 494 | "Size: %8lu kB\n" |
@@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v) | |||
479 | return 0; | 527 | return 0; |
480 | } | 528 | } |
481 | 529 | ||
530 | static int show_pid_smap(struct seq_file *m, void *v) | ||
531 | { | ||
532 | return show_smap(m, v, 1); | ||
533 | } | ||
534 | |||
535 | static int show_tid_smap(struct seq_file *m, void *v) | ||
536 | { | ||
537 | return show_smap(m, v, 0); | ||
538 | } | ||
539 | |||
482 | static const struct seq_operations proc_pid_smaps_op = { | 540 | static const struct seq_operations proc_pid_smaps_op = { |
483 | .start = m_start, | 541 | .start = m_start, |
484 | .next = m_next, | 542 | .next = m_next, |
485 | .stop = m_stop, | 543 | .stop = m_stop, |
486 | .show = show_smap | 544 | .show = show_pid_smap |
545 | }; | ||
546 | |||
547 | static const struct seq_operations proc_tid_smaps_op = { | ||
548 | .start = m_start, | ||
549 | .next = m_next, | ||
550 | .stop = m_stop, | ||
551 | .show = show_tid_smap | ||
487 | }; | 552 | }; |
488 | 553 | ||
489 | static int smaps_open(struct inode *inode, struct file *file) | 554 | static int pid_smaps_open(struct inode *inode, struct file *file) |
490 | { | 555 | { |
491 | return do_maps_open(inode, file, &proc_pid_smaps_op); | 556 | return do_maps_open(inode, file, &proc_pid_smaps_op); |
492 | } | 557 | } |
493 | 558 | ||
494 | const struct file_operations proc_smaps_operations = { | 559 | static int tid_smaps_open(struct inode *inode, struct file *file) |
495 | .open = smaps_open, | 560 | { |
561 | return do_maps_open(inode, file, &proc_tid_smaps_op); | ||
562 | } | ||
563 | |||
564 | const struct file_operations proc_pid_smaps_operations = { | ||
565 | .open = pid_smaps_open, | ||
566 | .read = seq_read, | ||
567 | .llseek = seq_lseek, | ||
568 | .release = seq_release_private, | ||
569 | }; | ||
570 | |||
571 | const struct file_operations proc_tid_smaps_operations = { | ||
572 | .open = tid_smaps_open, | ||
496 | .read = seq_read, | 573 | .read = seq_read, |
497 | .llseek = seq_lseek, | 574 | .llseek = seq_lseek, |
498 | .release = seq_release_private, | 575 | .release = seq_release_private, |
@@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
507 | struct page *page; | 584 | struct page *page; |
508 | 585 | ||
509 | split_huge_page_pmd(walk->mm, pmd); | 586 | split_huge_page_pmd(walk->mm, pmd); |
587 | if (pmd_trans_unstable(pmd)) | ||
588 | return 0; | ||
510 | 589 | ||
511 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 590 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
512 | for (; addr != end; pte++, addr += PAGE_SIZE) { | 591 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
@@ -598,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = { | |||
598 | .llseek = noop_llseek, | 677 | .llseek = noop_llseek, |
599 | }; | 678 | }; |
600 | 679 | ||
680 | typedef struct { | ||
681 | u64 pme; | ||
682 | } pagemap_entry_t; | ||
683 | |||
601 | struct pagemapread { | 684 | struct pagemapread { |
602 | int pos, len; | 685 | int pos, len; |
603 | u64 *buffer; | 686 | pagemap_entry_t *buffer; |
604 | }; | 687 | }; |
605 | 688 | ||
689 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) | ||
690 | #define PAGEMAP_WALK_MASK (PMD_MASK) | ||
691 | |||
606 | #define PM_ENTRY_BYTES sizeof(u64) | 692 | #define PM_ENTRY_BYTES sizeof(u64) |
607 | #define PM_STATUS_BITS 3 | 693 | #define PM_STATUS_BITS 3 |
608 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | 694 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) |
@@ -620,10 +706,15 @@ struct pagemapread { | |||
620 | #define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) | 706 | #define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) |
621 | #define PM_END_OF_BUFFER 1 | 707 | #define PM_END_OF_BUFFER 1 |
622 | 708 | ||
623 | static int add_to_pagemap(unsigned long addr, u64 pfn, | 709 | static inline pagemap_entry_t make_pme(u64 val) |
710 | { | ||
711 | return (pagemap_entry_t) { .pme = val }; | ||
712 | } | ||
713 | |||
714 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, | ||
624 | struct pagemapread *pm) | 715 | struct pagemapread *pm) |
625 | { | 716 | { |
626 | pm->buffer[pm->pos++] = pfn; | 717 | pm->buffer[pm->pos++] = *pme; |
627 | if (pm->pos >= pm->len) | 718 | if (pm->pos >= pm->len) |
628 | return PM_END_OF_BUFFER; | 719 | return PM_END_OF_BUFFER; |
629 | return 0; | 720 | return 0; |
@@ -635,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, | |||
635 | struct pagemapread *pm = walk->private; | 726 | struct pagemapread *pm = walk->private; |
636 | unsigned long addr; | 727 | unsigned long addr; |
637 | int err = 0; | 728 | int err = 0; |
729 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); | ||
730 | |||
638 | for (addr = start; addr < end; addr += PAGE_SIZE) { | 731 | for (addr = start; addr < end; addr += PAGE_SIZE) { |
639 | err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); | 732 | err = add_to_pagemap(addr, &pme, pm); |
640 | if (err) | 733 | if (err) |
641 | break; | 734 | break; |
642 | } | 735 | } |
@@ -649,17 +742,35 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) | |||
649 | return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); | 742 | return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); |
650 | } | 743 | } |
651 | 744 | ||
652 | static u64 pte_to_pagemap_entry(pte_t pte) | 745 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte) |
653 | { | 746 | { |
654 | u64 pme = 0; | ||
655 | if (is_swap_pte(pte)) | 747 | if (is_swap_pte(pte)) |
656 | pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) | 748 | *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte)) |
657 | | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; | 749 | | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP); |
658 | else if (pte_present(pte)) | 750 | else if (pte_present(pte)) |
659 | pme = PM_PFRAME(pte_pfn(pte)) | 751 | *pme = make_pme(PM_PFRAME(pte_pfn(pte)) |
660 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; | 752 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); |
661 | return pme; | 753 | } |
754 | |||
755 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
756 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, | ||
757 | pmd_t pmd, int offset) | ||
758 | { | ||
759 | /* | ||
760 | * Currently pmd for thp is always present because thp can not be | ||
761 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) | ||
762 | * This if-check is just to prepare for future implementation. | ||
763 | */ | ||
764 | if (pmd_present(pmd)) | ||
765 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | ||
766 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); | ||
662 | } | 767 | } |
768 | #else | ||
769 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, | ||
770 | pmd_t pmd, int offset) | ||
771 | { | ||
772 | } | ||
773 | #endif | ||
663 | 774 | ||
664 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 775 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
665 | struct mm_walk *walk) | 776 | struct mm_walk *walk) |
@@ -668,13 +779,30 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
668 | struct pagemapread *pm = walk->private; | 779 | struct pagemapread *pm = walk->private; |
669 | pte_t *pte; | 780 | pte_t *pte; |
670 | int err = 0; | 781 | int err = 0; |
782 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); | ||
671 | 783 | ||
672 | split_huge_page_pmd(walk->mm, pmd); | 784 | if (pmd_trans_unstable(pmd)) |
785 | return 0; | ||
673 | 786 | ||
674 | /* find the first VMA at or above 'addr' */ | 787 | /* find the first VMA at or above 'addr' */ |
675 | vma = find_vma(walk->mm, addr); | 788 | vma = find_vma(walk->mm, addr); |
789 | spin_lock(&walk->mm->page_table_lock); | ||
790 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | ||
791 | for (; addr != end; addr += PAGE_SIZE) { | ||
792 | unsigned long offset; | ||
793 | |||
794 | offset = (addr & ~PAGEMAP_WALK_MASK) >> | ||
795 | PAGE_SHIFT; | ||
796 | thp_pmd_to_pagemap_entry(&pme, *pmd, offset); | ||
797 | err = add_to_pagemap(addr, &pme, pm); | ||
798 | if (err) | ||
799 | break; | ||
800 | } | ||
801 | spin_unlock(&walk->mm->page_table_lock); | ||
802 | return err; | ||
803 | } | ||
804 | |||
676 | for (; addr != end; addr += PAGE_SIZE) { | 805 | for (; addr != end; addr += PAGE_SIZE) { |
677 | u64 pfn = PM_NOT_PRESENT; | ||
678 | 806 | ||
679 | /* check to see if we've left 'vma' behind | 807 | /* check to see if we've left 'vma' behind |
680 | * and need a new, higher one */ | 808 | * and need a new, higher one */ |
@@ -686,11 +814,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
686 | if (vma && (vma->vm_start <= addr) && | 814 | if (vma && (vma->vm_start <= addr) && |
687 | !is_vm_hugetlb_page(vma)) { | 815 | !is_vm_hugetlb_page(vma)) { |
688 | pte = pte_offset_map(pmd, addr); | 816 | pte = pte_offset_map(pmd, addr); |
689 | pfn = pte_to_pagemap_entry(*pte); | 817 | pte_to_pagemap_entry(&pme, *pte); |
690 | /* unmap before userspace copy */ | 818 | /* unmap before userspace copy */ |
691 | pte_unmap(pte); | 819 | pte_unmap(pte); |
692 | } | 820 | } |
693 | err = add_to_pagemap(addr, pfn, pm); | 821 | err = add_to_pagemap(addr, &pme, pm); |
694 | if (err) | 822 | if (err) |
695 | return err; | 823 | return err; |
696 | } | 824 | } |
@@ -701,13 +829,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
701 | } | 829 | } |
702 | 830 | ||
703 | #ifdef CONFIG_HUGETLB_PAGE | 831 | #ifdef CONFIG_HUGETLB_PAGE |
704 | static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) | 832 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, |
833 | pte_t pte, int offset) | ||
705 | { | 834 | { |
706 | u64 pme = 0; | ||
707 | if (pte_present(pte)) | 835 | if (pte_present(pte)) |
708 | pme = PM_PFRAME(pte_pfn(pte) + offset) | 836 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
709 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; | 837 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); |
710 | return pme; | ||
711 | } | 838 | } |
712 | 839 | ||
713 | /* This function walks within one hugetlb entry in the single call */ | 840 | /* This function walks within one hugetlb entry in the single call */ |
@@ -717,12 +844,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
717 | { | 844 | { |
718 | struct pagemapread *pm = walk->private; | 845 | struct pagemapread *pm = walk->private; |
719 | int err = 0; | 846 | int err = 0; |
720 | u64 pfn; | 847 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); |
721 | 848 | ||
722 | for (; addr != end; addr += PAGE_SIZE) { | 849 | for (; addr != end; addr += PAGE_SIZE) { |
723 | int offset = (addr & ~hmask) >> PAGE_SHIFT; | 850 | int offset = (addr & ~hmask) >> PAGE_SHIFT; |
724 | pfn = huge_pte_to_pagemap_entry(*pte, offset); | 851 | huge_pte_to_pagemap_entry(&pme, *pte, offset); |
725 | err = add_to_pagemap(addr, pfn, pm); | 852 | err = add_to_pagemap(addr, &pme, pm); |
726 | if (err) | 853 | if (err) |
727 | return err; | 854 | return err; |
728 | } | 855 | } |
@@ -757,8 +884,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
757 | * determine which areas of memory are actually mapped and llseek to | 884 | * determine which areas of memory are actually mapped and llseek to |
758 | * skip over unmapped regions. | 885 | * skip over unmapped regions. |
759 | */ | 886 | */ |
760 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) | ||
761 | #define PAGEMAP_WALK_MASK (PMD_MASK) | ||
762 | static ssize_t pagemap_read(struct file *file, char __user *buf, | 887 | static ssize_t pagemap_read(struct file *file, char __user *buf, |
763 | size_t count, loff_t *ppos) | 888 | size_t count, loff_t *ppos) |
764 | { | 889 | { |
@@ -941,26 +1066,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | |||
941 | pte_t *pte; | 1066 | pte_t *pte; |
942 | 1067 | ||
943 | md = walk->private; | 1068 | md = walk->private; |
944 | spin_lock(&walk->mm->page_table_lock); | 1069 | |
945 | if (pmd_trans_huge(*pmd)) { | 1070 | if (pmd_trans_huge_lock(pmd, md->vma) == 1) { |
946 | if (pmd_trans_splitting(*pmd)) { | 1071 | pte_t huge_pte = *(pte_t *)pmd; |
947 | spin_unlock(&walk->mm->page_table_lock); | 1072 | struct page *page; |
948 | wait_split_huge_page(md->vma->anon_vma, pmd); | 1073 | |
949 | } else { | 1074 | page = can_gather_numa_stats(huge_pte, md->vma, addr); |
950 | pte_t huge_pte = *(pte_t *)pmd; | 1075 | if (page) |
951 | struct page *page; | 1076 | gather_stats(page, md, pte_dirty(huge_pte), |
952 | 1077 | HPAGE_PMD_SIZE/PAGE_SIZE); | |
953 | page = can_gather_numa_stats(huge_pte, md->vma, addr); | ||
954 | if (page) | ||
955 | gather_stats(page, md, pte_dirty(huge_pte), | ||
956 | HPAGE_PMD_SIZE/PAGE_SIZE); | ||
957 | spin_unlock(&walk->mm->page_table_lock); | ||
958 | return 0; | ||
959 | } | ||
960 | } else { | ||
961 | spin_unlock(&walk->mm->page_table_lock); | 1078 | spin_unlock(&walk->mm->page_table_lock); |
1079 | return 0; | ||
962 | } | 1080 | } |
963 | 1081 | ||
1082 | if (pmd_trans_unstable(pmd)) | ||
1083 | return 0; | ||
964 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 1084 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
965 | do { | 1085 | do { |
966 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | 1086 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); |
@@ -1002,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | |||
1002 | /* | 1122 | /* |
1003 | * Display pages allocated per node and memory policy via /proc. | 1123 | * Display pages allocated per node and memory policy via /proc. |
1004 | */ | 1124 | */ |
1005 | static int show_numa_map(struct seq_file *m, void *v) | 1125 | static int show_numa_map(struct seq_file *m, void *v, int is_pid) |
1006 | { | 1126 | { |
1007 | struct numa_maps_private *numa_priv = m->private; | 1127 | struct numa_maps_private *numa_priv = m->private; |
1008 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; | 1128 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; |
@@ -1039,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v) | |||
1039 | seq_path(m, &file->f_path, "\n\t= "); | 1159 | seq_path(m, &file->f_path, "\n\t= "); |
1040 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1160 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1041 | seq_printf(m, " heap"); | 1161 | seq_printf(m, " heap"); |
1042 | } else if (vma->vm_start <= mm->start_stack && | 1162 | } else { |
1043 | vma->vm_end >= mm->start_stack) { | 1163 | pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); |
1044 | seq_printf(m, " stack"); | 1164 | if (tid != 0) { |
1165 | /* | ||
1166 | * Thread stack in /proc/PID/task/TID/maps or | ||
1167 | * the main process stack. | ||
1168 | */ | ||
1169 | if (!is_pid || (vma->vm_start <= mm->start_stack && | ||
1170 | vma->vm_end >= mm->start_stack)) | ||
1171 | seq_printf(m, " stack"); | ||
1172 | else | ||
1173 | seq_printf(m, " stack:%d", tid); | ||
1174 | } | ||
1045 | } | 1175 | } |
1046 | 1176 | ||
1047 | if (is_vm_hugetlb_page(vma)) | 1177 | if (is_vm_hugetlb_page(vma)) |
@@ -1084,21 +1214,39 @@ out: | |||
1084 | return 0; | 1214 | return 0; |
1085 | } | 1215 | } |
1086 | 1216 | ||
1217 | static int show_pid_numa_map(struct seq_file *m, void *v) | ||
1218 | { | ||
1219 | return show_numa_map(m, v, 1); | ||
1220 | } | ||
1221 | |||
1222 | static int show_tid_numa_map(struct seq_file *m, void *v) | ||
1223 | { | ||
1224 | return show_numa_map(m, v, 0); | ||
1225 | } | ||
1226 | |||
1087 | static const struct seq_operations proc_pid_numa_maps_op = { | 1227 | static const struct seq_operations proc_pid_numa_maps_op = { |
1088 | .start = m_start, | 1228 | .start = m_start, |
1089 | .next = m_next, | 1229 | .next = m_next, |
1090 | .stop = m_stop, | 1230 | .stop = m_stop, |
1091 | .show = show_numa_map, | 1231 | .show = show_pid_numa_map, |
1232 | }; | ||
1233 | |||
1234 | static const struct seq_operations proc_tid_numa_maps_op = { | ||
1235 | .start = m_start, | ||
1236 | .next = m_next, | ||
1237 | .stop = m_stop, | ||
1238 | .show = show_tid_numa_map, | ||
1092 | }; | 1239 | }; |
1093 | 1240 | ||
1094 | static int numa_maps_open(struct inode *inode, struct file *file) | 1241 | static int numa_maps_open(struct inode *inode, struct file *file, |
1242 | const struct seq_operations *ops) | ||
1095 | { | 1243 | { |
1096 | struct numa_maps_private *priv; | 1244 | struct numa_maps_private *priv; |
1097 | int ret = -ENOMEM; | 1245 | int ret = -ENOMEM; |
1098 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 1246 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
1099 | if (priv) { | 1247 | if (priv) { |
1100 | priv->proc_maps.pid = proc_pid(inode); | 1248 | priv->proc_maps.pid = proc_pid(inode); |
1101 | ret = seq_open(file, &proc_pid_numa_maps_op); | 1249 | ret = seq_open(file, ops); |
1102 | if (!ret) { | 1250 | if (!ret) { |
1103 | struct seq_file *m = file->private_data; | 1251 | struct seq_file *m = file->private_data; |
1104 | m->private = priv; | 1252 | m->private = priv; |
@@ -1109,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file) | |||
1109 | return ret; | 1257 | return ret; |
1110 | } | 1258 | } |
1111 | 1259 | ||
1112 | const struct file_operations proc_numa_maps_operations = { | 1260 | static int pid_numa_maps_open(struct inode *inode, struct file *file) |
1113 | .open = numa_maps_open, | 1261 | { |
1262 | return numa_maps_open(inode, file, &proc_pid_numa_maps_op); | ||
1263 | } | ||
1264 | |||
1265 | static int tid_numa_maps_open(struct inode *inode, struct file *file) | ||
1266 | { | ||
1267 | return numa_maps_open(inode, file, &proc_tid_numa_maps_op); | ||
1268 | } | ||
1269 | |||
1270 | const struct file_operations proc_pid_numa_maps_operations = { | ||
1271 | .open = pid_numa_maps_open, | ||
1272 | .read = seq_read, | ||
1273 | .llseek = seq_lseek, | ||
1274 | .release = seq_release_private, | ||
1275 | }; | ||
1276 | |||
1277 | const struct file_operations proc_tid_numa_maps_operations = { | ||
1278 | .open = tid_numa_maps_open, | ||
1114 | .read = seq_read, | 1279 | .read = seq_read, |
1115 | .llseek = seq_lseek, | 1280 | .llseek = seq_lseek, |
1116 | .release = seq_release_private, | 1281 | .release = seq_release_private, |
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 980de547c070..74fe164d1b23 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len) | |||
134 | /* | 134 | /* |
135 | * display a single VMA to a sequenced file | 135 | * display a single VMA to a sequenced file |
136 | */ | 136 | */ |
137 | static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | 137 | static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, |
138 | int is_pid) | ||
138 | { | 139 | { |
139 | struct mm_struct *mm = vma->vm_mm; | 140 | struct mm_struct *mm = vma->vm_mm; |
141 | struct proc_maps_private *priv = m->private; | ||
140 | unsigned long ino = 0; | 142 | unsigned long ino = 0; |
141 | struct file *file; | 143 | struct file *file; |
142 | dev_t dev = 0; | 144 | dev_t dev = 0; |
@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | |||
168 | pad_len_spaces(m, len); | 170 | pad_len_spaces(m, len); |
169 | seq_path(m, &file->f_path, ""); | 171 | seq_path(m, &file->f_path, ""); |
170 | } else if (mm) { | 172 | } else if (mm) { |
171 | if (vma->vm_start <= mm->start_stack && | 173 | pid_t tid = vm_is_stack(priv->task, vma, is_pid); |
172 | vma->vm_end >= mm->start_stack) { | 174 | |
175 | if (tid != 0) { | ||
173 | pad_len_spaces(m, len); | 176 | pad_len_spaces(m, len); |
174 | seq_puts(m, "[stack]"); | 177 | /* |
178 | * Thread stack in /proc/PID/task/TID/maps or | ||
179 | * the main process stack. | ||
180 | */ | ||
181 | if (!is_pid || (vma->vm_start <= mm->start_stack && | ||
182 | vma->vm_end >= mm->start_stack)) | ||
183 | seq_printf(m, "[stack]"); | ||
184 | else | ||
185 | seq_printf(m, "[stack:%d]", tid); | ||
175 | } | 186 | } |
176 | } | 187 | } |
177 | 188 | ||
@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | |||
182 | /* | 193 | /* |
183 | * display mapping lines for a particular process's /proc/pid/maps | 194 | * display mapping lines for a particular process's /proc/pid/maps |
184 | */ | 195 | */ |
185 | static int show_map(struct seq_file *m, void *_p) | 196 | static int show_map(struct seq_file *m, void *_p, int is_pid) |
186 | { | 197 | { |
187 | struct rb_node *p = _p; | 198 | struct rb_node *p = _p; |
188 | 199 | ||
189 | return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); | 200 | return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), |
201 | is_pid); | ||
202 | } | ||
203 | |||
204 | static int show_pid_map(struct seq_file *m, void *_p) | ||
205 | { | ||
206 | return show_map(m, _p, 1); | ||
207 | } | ||
208 | |||
209 | static int show_tid_map(struct seq_file *m, void *_p) | ||
210 | { | ||
211 | return show_map(m, _p, 0); | ||
190 | } | 212 | } |
191 | 213 | ||
192 | static void *m_start(struct seq_file *m, loff_t *pos) | 214 | static void *m_start(struct seq_file *m, loff_t *pos) |
@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = { | |||
240 | .start = m_start, | 262 | .start = m_start, |
241 | .next = m_next, | 263 | .next = m_next, |
242 | .stop = m_stop, | 264 | .stop = m_stop, |
243 | .show = show_map | 265 | .show = show_pid_map |
266 | }; | ||
267 | |||
268 | static const struct seq_operations proc_tid_maps_ops = { | ||
269 | .start = m_start, | ||
270 | .next = m_next, | ||
271 | .stop = m_stop, | ||
272 | .show = show_tid_map | ||
244 | }; | 273 | }; |
245 | 274 | ||
246 | static int maps_open(struct inode *inode, struct file *file) | 275 | static int maps_open(struct inode *inode, struct file *file, |
276 | const struct seq_operations *ops) | ||
247 | { | 277 | { |
248 | struct proc_maps_private *priv; | 278 | struct proc_maps_private *priv; |
249 | int ret = -ENOMEM; | 279 | int ret = -ENOMEM; |
@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file) | |||
251 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 281 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
252 | if (priv) { | 282 | if (priv) { |
253 | priv->pid = proc_pid(inode); | 283 | priv->pid = proc_pid(inode); |
254 | ret = seq_open(file, &proc_pid_maps_ops); | 284 | ret = seq_open(file, ops); |
255 | if (!ret) { | 285 | if (!ret) { |
256 | struct seq_file *m = file->private_data; | 286 | struct seq_file *m = file->private_data; |
257 | m->private = priv; | 287 | m->private = priv; |
@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file) | |||
262 | return ret; | 292 | return ret; |
263 | } | 293 | } |
264 | 294 | ||
265 | const struct file_operations proc_maps_operations = { | 295 | static int pid_maps_open(struct inode *inode, struct file *file) |
266 | .open = maps_open, | 296 | { |
297 | return maps_open(inode, file, &proc_pid_maps_ops); | ||
298 | } | ||
299 | |||
300 | static int tid_maps_open(struct inode *inode, struct file *file) | ||
301 | { | ||
302 | return maps_open(inode, file, &proc_tid_maps_ops); | ||
303 | } | ||
304 | |||
305 | const struct file_operations proc_pid_maps_operations = { | ||
306 | .open = pid_maps_open, | ||
307 | .read = seq_read, | ||
308 | .llseek = seq_lseek, | ||
309 | .release = seq_release_private, | ||
310 | }; | ||
311 | |||
312 | const struct file_operations proc_tid_maps_operations = { | ||
313 | .open = tid_maps_open, | ||
267 | .read = seq_read, | 314 | .read = seq_read, |
268 | .llseek = seq_lseek, | 315 | .llseek = seq_lseek, |
269 | .release = seq_release_private, | 316 | .release = seq_release_private, |
diff --git a/fs/seq_file.c b/fs/seq_file.c index 4023d6be939b..aa242dc99373 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
@@ -140,9 +140,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) | |||
140 | 140 | ||
141 | mutex_lock(&m->lock); | 141 | mutex_lock(&m->lock); |
142 | 142 | ||
143 | /* | ||
144 | * seq_file->op->..m_start/m_stop/m_next may do special actions | ||
145 | * or optimisations based on the file->f_version, so we want to | ||
146 | * pass the file->f_version to those methods. | ||
147 | * | ||
148 | * seq_file->version is just copy of f_version, and seq_file | ||
149 | * methods can treat it simply as file version. | ||
150 | * It is copied in first and copied out after all operations. | ||
151 | * It is convenient to have it as part of structure to avoid the | ||
152 | * need of passing another argument to all the seq_file methods. | ||
153 | */ | ||
154 | m->version = file->f_version; | ||
155 | |||
143 | /* Don't assume *ppos is where we left it */ | 156 | /* Don't assume *ppos is where we left it */ |
144 | if (unlikely(*ppos != m->read_pos)) { | 157 | if (unlikely(*ppos != m->read_pos)) { |
145 | m->read_pos = *ppos; | ||
146 | while ((err = traverse(m, *ppos)) == -EAGAIN) | 158 | while ((err = traverse(m, *ppos)) == -EAGAIN) |
147 | ; | 159 | ; |
148 | if (err) { | 160 | if (err) { |
@@ -152,21 +164,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) | |||
152 | m->index = 0; | 164 | m->index = 0; |
153 | m->count = 0; | 165 | m->count = 0; |
154 | goto Done; | 166 | goto Done; |
167 | } else { | ||
168 | m->read_pos = *ppos; | ||
155 | } | 169 | } |
156 | } | 170 | } |
157 | 171 | ||
158 | /* | ||
159 | * seq_file->op->..m_start/m_stop/m_next may do special actions | ||
160 | * or optimisations based on the file->f_version, so we want to | ||
161 | * pass the file->f_version to those methods. | ||
162 | * | ||
163 | * seq_file->version is just copy of f_version, and seq_file | ||
164 | * methods can treat it simply as file version. | ||
165 | * It is copied in first and copied out after all operations. | ||
166 | * It is convenient to have it as part of structure to avoid the | ||
167 | * need of passing another argument to all the seq_file methods. | ||
168 | */ | ||
169 | m->version = file->f_version; | ||
170 | /* grab buffer if we didn't have one */ | 172 | /* grab buffer if we didn't have one */ |
171 | if (!m->buf) { | 173 | if (!m->buf) { |
172 | m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); | 174 | m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 76bff2bff15e..a03c098b0cce 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, | |||
425 | unsigned long size); | 425 | unsigned long size); |
426 | #endif | 426 | #endif |
427 | 427 | ||
428 | #ifdef CONFIG_MMU | ||
429 | |||
428 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 430 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE |
429 | static inline int pmd_trans_huge(pmd_t pmd) | 431 | static inline int pmd_trans_huge(pmd_t pmd) |
430 | { | 432 | { |
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd) | |||
441 | return 0; | 443 | return 0; |
442 | } | 444 | } |
443 | #endif /* __HAVE_ARCH_PMD_WRITE */ | 445 | #endif /* __HAVE_ARCH_PMD_WRITE */ |
446 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
447 | |||
448 | /* | ||
449 | * This function is meant to be used by sites walking pagetables with | ||
450 | * the mmap_sem hold in read mode to protect against MADV_DONTNEED and | ||
451 | * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd | ||
452 | * into a null pmd and the transhuge page fault can convert a null pmd | ||
453 | * into an hugepmd or into a regular pmd (if the hugepage allocation | ||
454 | * fails). While holding the mmap_sem in read mode the pmd becomes | ||
455 | * stable and stops changing under us only if it's not null and not a | ||
456 | * transhuge pmd. When those races occurs and this function makes a | ||
457 | * difference vs the standard pmd_none_or_clear_bad, the result is | ||
458 | * undefined so behaving like if the pmd was none is safe (because it | ||
459 | * can return none anyway). The compiler level barrier() is critically | ||
460 | * important to compute the two checks atomically on the same pmdval. | ||
461 | */ | ||
462 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) | ||
463 | { | ||
464 | /* depend on compiler for an atomic pmd read */ | ||
465 | pmd_t pmdval = *pmd; | ||
466 | /* | ||
467 | * The barrier will stabilize the pmdval in a register or on | ||
468 | * the stack so that it will stop changing under the code. | ||
469 | */ | ||
470 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
471 | barrier(); | ||
472 | #endif | ||
473 | if (pmd_none(pmdval)) | ||
474 | return 1; | ||
475 | if (unlikely(pmd_bad(pmdval))) { | ||
476 | if (!pmd_trans_huge(pmdval)) | ||
477 | pmd_clear_bad(pmd); | ||
478 | return 1; | ||
479 | } | ||
480 | return 0; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * This is a noop if Transparent Hugepage Support is not built into | ||
485 | * the kernel. Otherwise it is equivalent to | ||
486 | * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in | ||
487 | * places that already verified the pmd is not none and they want to | ||
488 | * walk ptes while holding the mmap sem in read mode (write mode don't | ||
489 | * need this). If THP is not enabled, the pmd can't go away under the | ||
490 | * code even if MADV_DONTNEED runs, but if THP is enabled we need to | ||
491 | * run a pmd_trans_unstable before walking the ptes after | ||
492 | * split_huge_page_pmd returns (because it may have run when the pmd | ||
493 | * become null, but then a page fault can map in a THP and not a | ||
494 | * regular page). | ||
495 | */ | ||
496 | static inline int pmd_trans_unstable(pmd_t *pmd) | ||
497 | { | ||
498 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
499 | return pmd_none_or_trans_huge_or_clear_bad(pmd); | ||
500 | #else | ||
501 | return 0; | ||
444 | #endif | 502 | #endif |
503 | } | ||
504 | |||
505 | #endif /* CONFIG_MMU */ | ||
445 | 506 | ||
446 | #endif /* !__ASSEMBLY__ */ | 507 | #endif /* !__ASSEMBLY__ */ |
447 | 508 | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 501adb1b2f43..5a85b3415c1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -498,7 +498,7 @@ struct cgroup_subsys { | |||
498 | struct list_head sibling; | 498 | struct list_head sibling; |
499 | /* used when use_id == true */ | 499 | /* used when use_id == true */ |
500 | struct idr idr; | 500 | struct idr idr; |
501 | rwlock_t id_lock; | 501 | spinlock_t id_lock; |
502 | 502 | ||
503 | /* should be defined only by modular subsystems */ | 503 | /* should be defined only by modular subsystems */ |
504 | struct module *module; | 504 | struct module *module; |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index bb2bbdbe5464..51a90b7f2d60 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -23,6 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); | |||
23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 24 | int order, gfp_t gfp_mask, nodemask_t *mask, |
25 | bool sync); | 25 | bool sync); |
26 | extern int compact_pgdat(pg_data_t *pgdat, int order); | ||
26 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 27 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
27 | 28 | ||
28 | /* Do not skip compaction more than 64 times */ | 29 | /* Do not skip compaction more than 64 times */ |
@@ -33,20 +34,26 @@ extern unsigned long compaction_suitable(struct zone *zone, int order); | |||
33 | * allocation success. 1 << compact_defer_limit compactions are skipped up | 34 | * allocation success. 1 << compact_defer_limit compactions are skipped up |
34 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | 35 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT |
35 | */ | 36 | */ |
36 | static inline void defer_compaction(struct zone *zone) | 37 | static inline void defer_compaction(struct zone *zone, int order) |
37 | { | 38 | { |
38 | zone->compact_considered = 0; | 39 | zone->compact_considered = 0; |
39 | zone->compact_defer_shift++; | 40 | zone->compact_defer_shift++; |
40 | 41 | ||
42 | if (order < zone->compact_order_failed) | ||
43 | zone->compact_order_failed = order; | ||
44 | |||
41 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | 45 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) |
42 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | 46 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; |
43 | } | 47 | } |
44 | 48 | ||
45 | /* Returns true if compaction should be skipped this time */ | 49 | /* Returns true if compaction should be skipped this time */ |
46 | static inline bool compaction_deferred(struct zone *zone) | 50 | static inline bool compaction_deferred(struct zone *zone, int order) |
47 | { | 51 | { |
48 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | 52 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; |
49 | 53 | ||
54 | if (order < zone->compact_order_failed) | ||
55 | return false; | ||
56 | |||
50 | /* Avoid possible overflow */ | 57 | /* Avoid possible overflow */ |
51 | if (++zone->compact_considered > defer_limit) | 58 | if (++zone->compact_considered > defer_limit) |
52 | zone->compact_considered = defer_limit; | 59 | zone->compact_considered = defer_limit; |
@@ -62,16 +69,21 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
62 | return COMPACT_CONTINUE; | 69 | return COMPACT_CONTINUE; |
63 | } | 70 | } |
64 | 71 | ||
72 | static inline int compact_pgdat(pg_data_t *pgdat, int order) | ||
73 | { | ||
74 | return COMPACT_CONTINUE; | ||
75 | } | ||
76 | |||
65 | static inline unsigned long compaction_suitable(struct zone *zone, int order) | 77 | static inline unsigned long compaction_suitable(struct zone *zone, int order) |
66 | { | 78 | { |
67 | return COMPACT_SKIPPED; | 79 | return COMPACT_SKIPPED; |
68 | } | 80 | } |
69 | 81 | ||
70 | static inline void defer_compaction(struct zone *zone) | 82 | static inline void defer_compaction(struct zone *zone, int order) |
71 | { | 83 | { |
72 | } | 84 | } |
73 | 85 | ||
74 | static inline bool compaction_deferred(struct zone *zone) | 86 | static inline bool compaction_deferred(struct zone *zone, int order) |
75 | { | 87 | { |
76 | return 1; | 88 | return 1; |
77 | } | 89 | } |
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..7a7e5fd2a277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); | |||
89 | extern void cpuset_print_task_mems_allowed(struct task_struct *p); | 89 | extern void cpuset_print_task_mems_allowed(struct task_struct *p); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * reading current mems_allowed and mempolicy in the fastpath must protected | 92 | * get_mems_allowed is required when making decisions involving mems_allowed |
93 | * by get_mems_allowed() | 93 | * such as during page allocation. mems_allowed can be updated in parallel |
94 | * and depending on the new value an operation can fail potentially causing | ||
95 | * process failure. A retry loop with get_mems_allowed and put_mems_allowed | ||
96 | * prevents these artificial failures. | ||
94 | */ | 97 | */ |
95 | static inline void get_mems_allowed(void) | 98 | static inline unsigned int get_mems_allowed(void) |
96 | { | 99 | { |
97 | current->mems_allowed_change_disable++; | 100 | return read_seqcount_begin(¤t->mems_allowed_seq); |
98 | |||
99 | /* | ||
100 | * ensure that reading mems_allowed and mempolicy happens after the | ||
101 | * update of ->mems_allowed_change_disable. | ||
102 | * | ||
103 | * the write-side task finds ->mems_allowed_change_disable is not 0, | ||
104 | * and knows the read-side task is reading mems_allowed or mempolicy, | ||
105 | * so it will clear old bits lazily. | ||
106 | */ | ||
107 | smp_mb(); | ||
108 | } | 101 | } |
109 | 102 | ||
110 | static inline void put_mems_allowed(void) | 103 | /* |
104 | * If this returns false, the operation that took place after get_mems_allowed | ||
105 | * may have failed. It is up to the caller to retry the operation if | ||
106 | * appropriate. | ||
107 | */ | ||
108 | static inline bool put_mems_allowed(unsigned int seq) | ||
111 | { | 109 | { |
112 | /* | 110 | return !read_seqcount_retry(¤t->mems_allowed_seq, seq); |
113 | * ensure that reading mems_allowed and mempolicy before reducing | ||
114 | * mems_allowed_change_disable. | ||
115 | * | ||
116 | * the write-side task will know that the read-side task is still | ||
117 | * reading mems_allowed or mempolicy, don't clears old bits in the | ||
118 | * nodemask. | ||
119 | */ | ||
120 | smp_mb(); | ||
121 | --ACCESS_ONCE(current->mems_allowed_change_disable); | ||
122 | } | 111 | } |
123 | 112 | ||
124 | static inline void set_mems_allowed(nodemask_t nodemask) | 113 | static inline void set_mems_allowed(nodemask_t nodemask) |
125 | { | 114 | { |
126 | task_lock(current); | 115 | task_lock(current); |
116 | write_seqcount_begin(¤t->mems_allowed_seq); | ||
127 | current->mems_allowed = nodemask; | 117 | current->mems_allowed = nodemask; |
118 | write_seqcount_end(¤t->mems_allowed_seq); | ||
128 | task_unlock(current); | 119 | task_unlock(current); |
129 | } | 120 | } |
130 | 121 | ||
@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) | |||
234 | { | 225 | { |
235 | } | 226 | } |
236 | 227 | ||
237 | static inline void get_mems_allowed(void) | 228 | static inline unsigned int get_mems_allowed(void) |
238 | { | 229 | { |
230 | return 0; | ||
239 | } | 231 | } |
240 | 232 | ||
241 | static inline void put_mems_allowed(void) | 233 | static inline bool put_mems_allowed(unsigned int seq) |
242 | { | 234 | { |
235 | return true; | ||
243 | } | 236 | } |
244 | 237 | ||
245 | #endif /* !CONFIG_CPUSETS */ | 238 | #endif /* !CONFIG_CPUSETS */ |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1b921299abc4..c8af7a2efb52 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -51,6 +51,9 @@ extern pmd_t *page_check_address_pmd(struct page *page, | |||
51 | unsigned long address, | 51 | unsigned long address, |
52 | enum page_check_address_pmd_flag flag); | 52 | enum page_check_address_pmd_flag flag); |
53 | 53 | ||
54 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) | ||
55 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) | ||
56 | |||
54 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 57 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
55 | #define HPAGE_PMD_SHIFT HPAGE_SHIFT | 58 | #define HPAGE_PMD_SHIFT HPAGE_SHIFT |
56 | #define HPAGE_PMD_MASK HPAGE_MASK | 59 | #define HPAGE_PMD_MASK HPAGE_MASK |
@@ -102,8 +105,6 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); | |||
102 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 105 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
103 | pmd_trans_huge(*____pmd)); \ | 106 | pmd_trans_huge(*____pmd)); \ |
104 | } while (0) | 107 | } while (0) |
105 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) | ||
106 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) | ||
107 | #if HPAGE_PMD_ORDER > MAX_ORDER | 108 | #if HPAGE_PMD_ORDER > MAX_ORDER |
108 | #error "hugepages can't be allocated by the buddy allocator" | 109 | #error "hugepages can't be allocated by the buddy allocator" |
109 | #endif | 110 | #endif |
@@ -113,6 +114,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
113 | unsigned long start, | 114 | unsigned long start, |
114 | unsigned long end, | 115 | unsigned long end, |
115 | long adjust_next); | 116 | long adjust_next); |
117 | extern int __pmd_trans_huge_lock(pmd_t *pmd, | ||
118 | struct vm_area_struct *vma); | ||
119 | /* mmap_sem must be held on entry */ | ||
120 | static inline int pmd_trans_huge_lock(pmd_t *pmd, | ||
121 | struct vm_area_struct *vma) | ||
122 | { | ||
123 | VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); | ||
124 | if (pmd_trans_huge(*pmd)) | ||
125 | return __pmd_trans_huge_lock(pmd, vma); | ||
126 | else | ||
127 | return 0; | ||
128 | } | ||
116 | static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, | 129 | static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, |
117 | unsigned long start, | 130 | unsigned long start, |
118 | unsigned long end, | 131 | unsigned long end, |
@@ -146,9 +159,9 @@ static inline struct page *compound_trans_head(struct page *page) | |||
146 | return page; | 159 | return page; |
147 | } | 160 | } |
148 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 161 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
149 | #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) | 162 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
150 | #define HPAGE_PMD_MASK ({ BUG(); 0; }) | 163 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
151 | #define HPAGE_PMD_SIZE ({ BUG(); 0; }) | 164 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) |
152 | 165 | ||
153 | #define hpage_nr_pages(x) 1 | 166 | #define hpage_nr_pages(x) 1 |
154 | 167 | ||
@@ -176,6 +189,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
176 | long adjust_next) | 189 | long adjust_next) |
177 | { | 190 | { |
178 | } | 191 | } |
192 | static inline int pmd_trans_huge_lock(pmd_t *pmd, | ||
193 | struct vm_area_struct *vma) | ||
194 | { | ||
195 | return 0; | ||
196 | } | ||
179 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 197 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
180 | 198 | ||
181 | #endif /* _LINUX_HUGE_MM_H */ | 199 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d9d6c868b86b..000837e126e6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -14,6 +14,15 @@ struct user_struct; | |||
14 | #include <linux/shm.h> | 14 | #include <linux/shm.h> |
15 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
16 | 16 | ||
17 | struct hugepage_subpool { | ||
18 | spinlock_t lock; | ||
19 | long count; | ||
20 | long max_hpages, used_hpages; | ||
21 | }; | ||
22 | |||
23 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); | ||
24 | void hugepage_put_subpool(struct hugepage_subpool *spool); | ||
25 | |||
17 | int PageHuge(struct page *page); | 26 | int PageHuge(struct page *page); |
18 | 27 | ||
19 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); | 28 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); |
@@ -128,35 +137,14 @@ enum { | |||
128 | }; | 137 | }; |
129 | 138 | ||
130 | #ifdef CONFIG_HUGETLBFS | 139 | #ifdef CONFIG_HUGETLBFS |
131 | struct hugetlbfs_config { | ||
132 | uid_t uid; | ||
133 | gid_t gid; | ||
134 | umode_t mode; | ||
135 | long nr_blocks; | ||
136 | long nr_inodes; | ||
137 | struct hstate *hstate; | ||
138 | }; | ||
139 | |||
140 | struct hugetlbfs_sb_info { | 140 | struct hugetlbfs_sb_info { |
141 | long max_blocks; /* blocks allowed */ | ||
142 | long free_blocks; /* blocks free */ | ||
143 | long max_inodes; /* inodes allowed */ | 141 | long max_inodes; /* inodes allowed */ |
144 | long free_inodes; /* inodes free */ | 142 | long free_inodes; /* inodes free */ |
145 | spinlock_t stat_lock; | 143 | spinlock_t stat_lock; |
146 | struct hstate *hstate; | 144 | struct hstate *hstate; |
145 | struct hugepage_subpool *spool; | ||
147 | }; | 146 | }; |
148 | 147 | ||
149 | |||
150 | struct hugetlbfs_inode_info { | ||
151 | struct shared_policy policy; | ||
152 | struct inode vfs_inode; | ||
153 | }; | ||
154 | |||
155 | static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) | ||
156 | { | ||
157 | return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); | ||
158 | } | ||
159 | |||
160 | static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | 148 | static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) |
161 | { | 149 | { |
162 | return sb->s_fs_info; | 150 | return sb->s_fs_info; |
@@ -164,10 +152,9 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | |||
164 | 152 | ||
165 | extern const struct file_operations hugetlbfs_file_operations; | 153 | extern const struct file_operations hugetlbfs_file_operations; |
166 | extern const struct vm_operations_struct hugetlb_vm_ops; | 154 | extern const struct vm_operations_struct hugetlb_vm_ops; |
167 | struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, | 155 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, |
156 | size_t size, vm_flags_t acct, | ||
168 | struct user_struct **user, int creat_flags); | 157 | struct user_struct **user, int creat_flags); |
169 | int hugetlb_get_quota(struct address_space *mapping, long delta); | ||
170 | void hugetlb_put_quota(struct address_space *mapping, long delta); | ||
171 | 158 | ||
172 | static inline int is_file_hugepages(struct file *file) | 159 | static inline int is_file_hugepages(struct file *file) |
173 | { | 160 | { |
@@ -179,15 +166,11 @@ static inline int is_file_hugepages(struct file *file) | |||
179 | return 0; | 166 | return 0; |
180 | } | 167 | } |
181 | 168 | ||
182 | static inline void set_file_hugepages(struct file *file) | ||
183 | { | ||
184 | file->f_op = &hugetlbfs_file_operations; | ||
185 | } | ||
186 | #else /* !CONFIG_HUGETLBFS */ | 169 | #else /* !CONFIG_HUGETLBFS */ |
187 | 170 | ||
188 | #define is_file_hugepages(file) 0 | 171 | #define is_file_hugepages(file) 0 |
189 | #define set_file_hugepages(file) BUG() | 172 | static inline struct file * |
190 | static inline struct file *hugetlb_file_setup(const char *name, size_t size, | 173 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, |
191 | vm_flags_t acctflag, struct user_struct **user, int creat_flags) | 174 | vm_flags_t acctflag, struct user_struct **user, int creat_flags) |
192 | { | 175 | { |
193 | return ERR_PTR(-ENOSYS); | 176 | return ERR_PTR(-ENOSYS); |
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f994d51f70f2..e4baff5f7ff4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -29,6 +29,13 @@ extern struct fs_struct init_fs; | |||
29 | #define INIT_GROUP_RWSEM(sig) | 29 | #define INIT_GROUP_RWSEM(sig) |
30 | #endif | 30 | #endif |
31 | 31 | ||
32 | #ifdef CONFIG_CPUSETS | ||
33 | #define INIT_CPUSET_SEQ \ | ||
34 | .mems_allowed_seq = SEQCNT_ZERO, | ||
35 | #else | ||
36 | #define INIT_CPUSET_SEQ | ||
37 | #endif | ||
38 | |||
32 | #define INIT_SIGNALS(sig) { \ | 39 | #define INIT_SIGNALS(sig) { \ |
33 | .nr_threads = 1, \ | 40 | .nr_threads = 1, \ |
34 | .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ | 41 | .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ |
@@ -192,6 +199,7 @@ extern struct cred init_cred; | |||
192 | INIT_FTRACE_GRAPH \ | 199 | INIT_FTRACE_GRAPH \ |
193 | INIT_TRACE_RECURSION \ | 200 | INIT_TRACE_RECURSION \ |
194 | INIT_TASK_RCU_PREEMPT(tsk) \ | 201 | INIT_TASK_RCU_PREEMPT(tsk) \ |
202 | INIT_CPUSET_SEQ \ | ||
195 | } | 203 | } |
196 | 204 | ||
197 | 205 | ||
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index bd92a89f4b0a..26a65711676f 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h | |||
@@ -30,6 +30,7 @@ | |||
30 | #define KPF_NOPAGE 20 | 30 | #define KPF_NOPAGE 20 |
31 | 31 | ||
32 | #define KPF_KSM 21 | 32 | #define KPF_KSM 21 |
33 | #define KPF_THP 22 | ||
33 | 34 | ||
34 | /* kernel hacking assistances | 35 | /* kernel hacking assistances |
35 | * WARNING: subject to change, never rely on them! | 36 | * WARNING: subject to change, never rely on them! |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b80de520670b..f94efd2f6c27 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void); | |||
77 | extern void mem_cgroup_uncharge_page(struct page *page); | 77 | extern void mem_cgroup_uncharge_page(struct page *page); |
78 | extern void mem_cgroup_uncharge_cache_page(struct page *page); | 78 | extern void mem_cgroup_uncharge_cache_page(struct page *page); |
79 | 79 | ||
80 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); | 80 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
81 | int order); | ||
81 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); | 82 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); |
82 | 83 | ||
83 | extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); | 84 | extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); |
@@ -140,6 +141,34 @@ static inline bool mem_cgroup_disabled(void) | |||
140 | return false; | 141 | return false; |
141 | } | 142 | } |
142 | 143 | ||
144 | void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, | ||
145 | unsigned long *flags); | ||
146 | |||
147 | extern atomic_t memcg_moving; | ||
148 | |||
149 | static inline void mem_cgroup_begin_update_page_stat(struct page *page, | ||
150 | bool *locked, unsigned long *flags) | ||
151 | { | ||
152 | if (mem_cgroup_disabled()) | ||
153 | return; | ||
154 | rcu_read_lock(); | ||
155 | *locked = false; | ||
156 | if (atomic_read(&memcg_moving)) | ||
157 | __mem_cgroup_begin_update_page_stat(page, locked, flags); | ||
158 | } | ||
159 | |||
160 | void __mem_cgroup_end_update_page_stat(struct page *page, | ||
161 | unsigned long *flags); | ||
162 | static inline void mem_cgroup_end_update_page_stat(struct page *page, | ||
163 | bool *locked, unsigned long *flags) | ||
164 | { | ||
165 | if (mem_cgroup_disabled()) | ||
166 | return; | ||
167 | if (*locked) | ||
168 | __mem_cgroup_end_update_page_stat(page, flags); | ||
169 | rcu_read_unlock(); | ||
170 | } | ||
171 | |||
143 | void mem_cgroup_update_page_stat(struct page *page, | 172 | void mem_cgroup_update_page_stat(struct page *page, |
144 | enum mem_cgroup_page_stat_item idx, | 173 | enum mem_cgroup_page_stat_item idx, |
145 | int val); | 174 | int val); |
@@ -298,21 +327,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
298 | { | 327 | { |
299 | } | 328 | } |
300 | 329 | ||
301 | static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) | ||
302 | { | ||
303 | return 0; | ||
304 | } | ||
305 | |||
306 | static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg, | ||
307 | int priority) | ||
308 | { | ||
309 | } | ||
310 | |||
311 | static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg, | ||
312 | int priority) | ||
313 | { | ||
314 | } | ||
315 | |||
316 | static inline bool mem_cgroup_disabled(void) | 330 | static inline bool mem_cgroup_disabled(void) |
317 | { | 331 | { |
318 | return true; | 332 | return true; |
@@ -355,6 +369,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
355 | { | 369 | { |
356 | } | 370 | } |
357 | 371 | ||
372 | static inline void mem_cgroup_begin_update_page_stat(struct page *page, | ||
373 | bool *locked, unsigned long *flags) | ||
374 | { | ||
375 | } | ||
376 | |||
377 | static inline void mem_cgroup_end_update_page_stat(struct page *page, | ||
378 | bool *locked, unsigned long *flags) | ||
379 | { | ||
380 | } | ||
381 | |||
358 | static inline void mem_cgroup_inc_page_stat(struct page *page, | 382 | static inline void mem_cgroup_inc_page_stat(struct page *page, |
359 | enum mem_cgroup_page_stat_item idx) | 383 | enum mem_cgroup_page_stat_item idx) |
360 | { | 384 | { |
@@ -391,7 +415,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
391 | struct page *newpage) | 415 | struct page *newpage) |
392 | { | 416 | { |
393 | } | 417 | } |
394 | #endif /* CONFIG_CGROUP_MEM_CONT */ | 418 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR */ |
395 | 419 | ||
396 | #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) | 420 | #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) |
397 | static inline bool | 421 | static inline bool |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 05ed2828a553..855c337b20c3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -8,7 +8,6 @@ | |||
8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); | 8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); |
9 | 9 | ||
10 | #ifdef CONFIG_MIGRATION | 10 | #ifdef CONFIG_MIGRATION |
11 | #define PAGE_MIGRATION 1 | ||
12 | 11 | ||
13 | extern void putback_lru_pages(struct list_head *l); | 12 | extern void putback_lru_pages(struct list_head *l); |
14 | extern int migrate_page(struct address_space *, | 13 | extern int migrate_page(struct address_space *, |
@@ -32,7 +31,6 @@ extern void migrate_page_copy(struct page *newpage, struct page *page); | |||
32 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, | 31 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, |
33 | struct page *newpage, struct page *page); | 32 | struct page *newpage, struct page *page); |
34 | #else | 33 | #else |
35 | #define PAGE_MIGRATION 0 | ||
36 | 34 | ||
37 | static inline void putback_lru_pages(struct list_head *l) {} | 35 | static inline void putback_lru_pages(struct list_head *l) {} |
38 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 36 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
diff --git a/include/linux/mm.h b/include/linux/mm.h index b5bb54d6d667..ee67e326b6f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1040,6 +1040,9 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, | |||
1040 | !vma_growsup(vma->vm_next, addr); | 1040 | !vma_growsup(vma->vm_next, addr); |
1041 | } | 1041 | } |
1042 | 1042 | ||
1043 | extern pid_t | ||
1044 | vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); | ||
1045 | |||
1043 | extern unsigned long move_page_tables(struct vm_area_struct *vma, | 1046 | extern unsigned long move_page_tables(struct vm_area_struct *vma, |
1044 | unsigned long old_addr, struct vm_area_struct *new_vma, | 1047 | unsigned long old_addr, struct vm_area_struct *new_vma, |
1045 | unsigned long new_addr, unsigned long len); | 1048 | unsigned long new_addr, unsigned long len); |
@@ -1058,19 +1061,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1058 | /* | 1061 | /* |
1059 | * per-process(per-mm_struct) statistics. | 1062 | * per-process(per-mm_struct) statistics. |
1060 | */ | 1063 | */ |
1061 | static inline void set_mm_counter(struct mm_struct *mm, int member, long value) | ||
1062 | { | ||
1063 | atomic_long_set(&mm->rss_stat.count[member], value); | ||
1064 | } | ||
1065 | |||
1066 | #if defined(SPLIT_RSS_COUNTING) | ||
1067 | unsigned long get_mm_counter(struct mm_struct *mm, int member); | ||
1068 | #else | ||
1069 | static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) | 1064 | static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) |
1070 | { | 1065 | { |
1071 | return atomic_long_read(&mm->rss_stat.count[member]); | 1066 | long val = atomic_long_read(&mm->rss_stat.count[member]); |
1072 | } | 1067 | |
1068 | #ifdef SPLIT_RSS_COUNTING | ||
1069 | /* | ||
1070 | * The counter is updated asynchronously and may go negative, | ||
1071 | * but a negative value is never what users expect to see. | ||
1072 | */ | ||
1073 | if (val < 0) | ||
1074 | val = 0; | ||
1073 | #endif | 1075 | #endif |
1076 | return (unsigned long)val; | ||
1077 | } | ||
1074 | 1078 | ||
1075 | static inline void add_mm_counter(struct mm_struct *mm, int member, long value) | 1079 | static inline void add_mm_counter(struct mm_struct *mm, int member, long value) |
1076 | { | 1080 | { |
@@ -1127,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, | |||
1127 | } | 1131 | } |
1128 | 1132 | ||
1129 | #if defined(SPLIT_RSS_COUNTING) | 1133 | #if defined(SPLIT_RSS_COUNTING) |
1130 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); | 1134 | void sync_mm_rss(struct mm_struct *mm); |
1131 | #else | 1135 | #else |
1132 | static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | 1136 | static inline void sync_mm_rss(struct mm_struct *mm) |
1133 | { | 1137 | { |
1134 | } | 1138 | } |
1135 | #endif | 1139 | #endif |
@@ -1291,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid, | |||
1291 | extern unsigned long find_min_pfn_with_active_regions(void); | 1295 | extern unsigned long find_min_pfn_with_active_regions(void); |
1292 | extern void free_bootmem_with_active_regions(int nid, | 1296 | extern void free_bootmem_with_active_regions(int nid, |
1293 | unsigned long max_low_pfn); | 1297 | unsigned long max_low_pfn); |
1294 | int add_from_early_node_map(struct range *range, int az, | ||
1295 | int nr_range, int nid); | ||
1296 | extern void sparse_memory_present_with_active_regions(int nid); | 1298 | extern void sparse_memory_present_with_active_regions(int nid); |
1297 | 1299 | ||
1298 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1300 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 650ba2fb3301..dff711509661 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -365,6 +365,7 @@ struct zone { | |||
365 | */ | 365 | */ |
366 | unsigned int compact_considered; | 366 | unsigned int compact_considered; |
367 | unsigned int compact_defer_shift; | 367 | unsigned int compact_defer_shift; |
368 | int compact_order_failed; | ||
368 | #endif | 369 | #endif |
369 | 370 | ||
370 | ZONE_PADDING(_pad1_) | 371 | ZONE_PADDING(_pad1_) |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 552fba9c7d5a..3d7647536b03 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | |||
49 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 49 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
50 | 50 | ||
51 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 51 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
52 | int order, nodemask_t *mask); | 52 | int order, nodemask_t *mask, bool force_kill); |
53 | extern int register_oom_notifier(struct notifier_block *nb); | 53 | extern int register_oom_notifier(struct notifier_block *nb); |
54 | extern int unregister_oom_notifier(struct notifier_block *nb); | 54 | extern int unregister_oom_notifier(struct notifier_block *nb); |
55 | 55 | ||
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e90a673be67e..6b25758e028e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -414,11 +414,26 @@ static inline int PageTransHuge(struct page *page) | |||
414 | return PageHead(page); | 414 | return PageHead(page); |
415 | } | 415 | } |
416 | 416 | ||
417 | /* | ||
418 | * PageTransCompound returns true for both transparent huge pages | ||
419 | * and hugetlbfs pages, so it should only be called when it's known | ||
420 | * that hugetlbfs pages aren't involved. | ||
421 | */ | ||
417 | static inline int PageTransCompound(struct page *page) | 422 | static inline int PageTransCompound(struct page *page) |
418 | { | 423 | { |
419 | return PageCompound(page); | 424 | return PageCompound(page); |
420 | } | 425 | } |
421 | 426 | ||
427 | /* | ||
428 | * PageTransTail returns true for both transparent huge pages | ||
429 | * and hugetlbfs pages, so it should only be called when it's known | ||
430 | * that hugetlbfs pages aren't involved. | ||
431 | */ | ||
432 | static inline int PageTransTail(struct page *page) | ||
433 | { | ||
434 | return PageTail(page); | ||
435 | } | ||
436 | |||
422 | #else | 437 | #else |
423 | 438 | ||
424 | static inline int PageTransHuge(struct page *page) | 439 | static inline int PageTransHuge(struct page *page) |
@@ -430,6 +445,11 @@ static inline int PageTransCompound(struct page *page) | |||
430 | { | 445 | { |
431 | return 0; | 446 | return 0; |
432 | } | 447 | } |
448 | |||
449 | static inline int PageTransTail(struct page *page) | ||
450 | { | ||
451 | return 0; | ||
452 | } | ||
433 | #endif | 453 | #endif |
434 | 454 | ||
435 | #ifdef CONFIG_MMU | 455 | #ifdef CONFIG_MMU |
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index a2d11771c84b..a88cdba27809 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h | |||
@@ -4,12 +4,8 @@ | |||
4 | enum { | 4 | enum { |
5 | /* flags for mem_cgroup */ | 5 | /* flags for mem_cgroup */ |
6 | PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ | 6 | PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ |
7 | PCG_CACHE, /* charged as cache */ | ||
8 | PCG_USED, /* this object is in use. */ | 7 | PCG_USED, /* this object is in use. */ |
9 | PCG_MIGRATION, /* under page migration */ | 8 | PCG_MIGRATION, /* under page migration */ |
10 | /* flags for mem_cgroup and file and I/O status */ | ||
11 | PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ | ||
12 | PCG_FILE_MAPPED, /* page is accounted as "mapped" */ | ||
13 | __NR_PCG_FLAGS, | 9 | __NR_PCG_FLAGS, |
14 | }; | 10 | }; |
15 | 11 | ||
@@ -64,19 +60,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ | |||
64 | static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ | 60 | static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ |
65 | { return test_and_clear_bit(PCG_##lname, &pc->flags); } | 61 | { return test_and_clear_bit(PCG_##lname, &pc->flags); } |
66 | 62 | ||
67 | /* Cache flag is set only once (at allocation) */ | ||
68 | TESTPCGFLAG(Cache, CACHE) | ||
69 | CLEARPCGFLAG(Cache, CACHE) | ||
70 | SETPCGFLAG(Cache, CACHE) | ||
71 | |||
72 | TESTPCGFLAG(Used, USED) | 63 | TESTPCGFLAG(Used, USED) |
73 | CLEARPCGFLAG(Used, USED) | 64 | CLEARPCGFLAG(Used, USED) |
74 | SETPCGFLAG(Used, USED) | 65 | SETPCGFLAG(Used, USED) |
75 | 66 | ||
76 | SETPCGFLAG(FileMapped, FILE_MAPPED) | ||
77 | CLEARPCGFLAG(FileMapped, FILE_MAPPED) | ||
78 | TESTPCGFLAG(FileMapped, FILE_MAPPED) | ||
79 | |||
80 | SETPCGFLAG(Migration, MIGRATION) | 67 | SETPCGFLAG(Migration, MIGRATION) |
81 | CLEARPCGFLAG(Migration, MIGRATION) | 68 | CLEARPCGFLAG(Migration, MIGRATION) |
82 | TESTPCGFLAG(Migration, MIGRATION) | 69 | TESTPCGFLAG(Migration, MIGRATION) |
@@ -85,7 +72,7 @@ static inline void lock_page_cgroup(struct page_cgroup *pc) | |||
85 | { | 72 | { |
86 | /* | 73 | /* |
87 | * Don't take this lock in IRQ context. | 74 | * Don't take this lock in IRQ context. |
88 | * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION | 75 | * This lock is for pc->mem_cgroup, USED, MIGRATION |
89 | */ | 76 | */ |
90 | bit_spin_lock(PCG_LOCK, &pc->flags); | 77 | bit_spin_lock(PCG_LOCK, &pc->flags); |
91 | } | 78 | } |
@@ -95,24 +82,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) | |||
95 | bit_spin_unlock(PCG_LOCK, &pc->flags); | 82 | bit_spin_unlock(PCG_LOCK, &pc->flags); |
96 | } | 83 | } |
97 | 84 | ||
98 | static inline void move_lock_page_cgroup(struct page_cgroup *pc, | ||
99 | unsigned long *flags) | ||
100 | { | ||
101 | /* | ||
102 | * We know updates to pc->flags of page cache's stats are from both of | ||
103 | * usual context or IRQ context. Disable IRQ to avoid deadlock. | ||
104 | */ | ||
105 | local_irq_save(*flags); | ||
106 | bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); | ||
107 | } | ||
108 | |||
109 | static inline void move_unlock_page_cgroup(struct page_cgroup *pc, | ||
110 | unsigned long *flags) | ||
111 | { | ||
112 | bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); | ||
113 | local_irq_restore(*flags); | ||
114 | } | ||
115 | |||
116 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ | 85 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ |
117 | struct page_cgroup; | 86 | struct page_cgroup; |
118 | 87 | ||
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1cdd62a2788a..fd07c4542cee 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -122,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_struct *); | |||
122 | int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); | 122 | int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); |
123 | void anon_vma_moveto_tail(struct vm_area_struct *); | 123 | void anon_vma_moveto_tail(struct vm_area_struct *); |
124 | int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); | 124 | int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); |
125 | void __anon_vma_link(struct vm_area_struct *); | ||
126 | 125 | ||
127 | static inline void anon_vma_merge(struct vm_area_struct *vma, | 126 | static inline void anon_vma_merge(struct vm_area_struct *vma, |
128 | struct vm_area_struct *next) | 127 | struct vm_area_struct *next) |
diff --git a/include/linux/sched.h b/include/linux/sched.h index e074e1e54f85..0c147a4260a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1514,7 +1514,7 @@ struct task_struct { | |||
1514 | #endif | 1514 | #endif |
1515 | #ifdef CONFIG_CPUSETS | 1515 | #ifdef CONFIG_CPUSETS |
1516 | nodemask_t mems_allowed; /* Protected by alloc_lock */ | 1516 | nodemask_t mems_allowed; /* Protected by alloc_lock */ |
1517 | int mems_allowed_change_disable; | 1517 | seqcount_t mems_allowed_seq; /* Sequence no to catch updates */ |
1518 | int cpuset_mem_spread_rotor; | 1518 | int cpuset_mem_spread_rotor; |
1519 | int cpuset_slab_spread_rotor; | 1519 | int cpuset_slab_spread_rotor; |
1520 | #endif | 1520 | #endif |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 3e60228e7299..b86b5c20617d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone, | |||
223 | extern void activate_page(struct page *); | 223 | extern void activate_page(struct page *); |
224 | extern void mark_page_accessed(struct page *); | 224 | extern void mark_page_accessed(struct page *); |
225 | extern void lru_add_drain(void); | 225 | extern void lru_add_drain(void); |
226 | extern void lru_add_drain_cpu(int cpu); | ||
226 | extern int lru_add_drain_all(void); | 227 | extern int lru_add_drain_all(void); |
227 | extern void rotate_reclaimable_page(struct page *page); | 228 | extern void rotate_reclaimable_page(struct page *page); |
228 | extern void deactivate_page(struct page *page); | 229 | extern void deactivate_page(struct page *page); |
@@ -329,7 +330,6 @@ extern long total_swap_pages; | |||
329 | extern void si_swapinfo(struct sysinfo *); | 330 | extern void si_swapinfo(struct sysinfo *); |
330 | extern swp_entry_t get_swap_page(void); | 331 | extern swp_entry_t get_swap_page(void); |
331 | extern swp_entry_t get_swap_page_of_type(int); | 332 | extern swp_entry_t get_swap_page_of_type(int); |
332 | extern int valid_swaphandles(swp_entry_t, unsigned long *); | ||
333 | extern int add_swap_count_continuation(swp_entry_t, gfp_t); | 333 | extern int add_swap_count_continuation(swp_entry_t, gfp_t); |
334 | extern void swap_shmem_alloc(swp_entry_t); | 334 | extern void swap_shmem_alloc(swp_entry_t); |
335 | extern int swap_duplicate(swp_entry_t); | 335 | extern int swap_duplicate(swp_entry_t); |
@@ -482,7 +482,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) | |||
482 | /* hugetlb_file_setup applies strict accounting */ | 482 | /* hugetlb_file_setup applies strict accounting */ |
483 | if (shmflg & SHM_NORESERVE) | 483 | if (shmflg & SHM_NORESERVE) |
484 | acctflag = VM_NORESERVE; | 484 | acctflag = VM_NORESERVE; |
485 | file = hugetlb_file_setup(name, size, acctflag, | 485 | file = hugetlb_file_setup(name, 0, size, acctflag, |
486 | &shp->mlock_user, HUGETLB_SHMFS_INODE); | 486 | &shp->mlock_user, HUGETLB_SHMFS_INODE); |
487 | } else { | 487 | } else { |
488 | /* | 488 | /* |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1ece8e20fdb5..f4ea4b6f3cf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -4881,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
4881 | 4881 | ||
4882 | rcu_assign_pointer(id->css, NULL); | 4882 | rcu_assign_pointer(id->css, NULL); |
4883 | rcu_assign_pointer(css->id, NULL); | 4883 | rcu_assign_pointer(css->id, NULL); |
4884 | write_lock(&ss->id_lock); | 4884 | spin_lock(&ss->id_lock); |
4885 | idr_remove(&ss->idr, id->id); | 4885 | idr_remove(&ss->idr, id->id); |
4886 | write_unlock(&ss->id_lock); | 4886 | spin_unlock(&ss->id_lock); |
4887 | kfree_rcu(id, rcu_head); | 4887 | kfree_rcu(id, rcu_head); |
4888 | } | 4888 | } |
4889 | EXPORT_SYMBOL_GPL(free_css_id); | 4889 | EXPORT_SYMBOL_GPL(free_css_id); |
@@ -4909,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4909 | error = -ENOMEM; | 4909 | error = -ENOMEM; |
4910 | goto err_out; | 4910 | goto err_out; |
4911 | } | 4911 | } |
4912 | write_lock(&ss->id_lock); | 4912 | spin_lock(&ss->id_lock); |
4913 | /* Don't use 0. allocates an ID of 1-65535 */ | 4913 | /* Don't use 0. allocates an ID of 1-65535 */ |
4914 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 4914 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); |
4915 | write_unlock(&ss->id_lock); | 4915 | spin_unlock(&ss->id_lock); |
4916 | 4916 | ||
4917 | /* Returns error when there are no free spaces for new ID.*/ | 4917 | /* Returns error when there are no free spaces for new ID.*/ |
4918 | if (error) { | 4918 | if (error) { |
@@ -4927,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4927 | return newid; | 4927 | return newid; |
4928 | remove_idr: | 4928 | remove_idr: |
4929 | error = -ENOSPC; | 4929 | error = -ENOSPC; |
4930 | write_lock(&ss->id_lock); | 4930 | spin_lock(&ss->id_lock); |
4931 | idr_remove(&ss->idr, myid); | 4931 | idr_remove(&ss->idr, myid); |
4932 | write_unlock(&ss->id_lock); | 4932 | spin_unlock(&ss->id_lock); |
4933 | err_out: | 4933 | err_out: |
4934 | kfree(newid); | 4934 | kfree(newid); |
4935 | return ERR_PTR(error); | 4935 | return ERR_PTR(error); |
@@ -4941,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
4941 | { | 4941 | { |
4942 | struct css_id *newid; | 4942 | struct css_id *newid; |
4943 | 4943 | ||
4944 | rwlock_init(&ss->id_lock); | 4944 | spin_lock_init(&ss->id_lock); |
4945 | idr_init(&ss->idr); | 4945 | idr_init(&ss->idr); |
4946 | 4946 | ||
4947 | newid = get_new_cssid(ss, 0); | 4947 | newid = get_new_cssid(ss, 0); |
@@ -5029,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
5029 | return NULL; | 5029 | return NULL; |
5030 | 5030 | ||
5031 | BUG_ON(!ss->use_id); | 5031 | BUG_ON(!ss->use_id); |
5032 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
5033 | |||
5032 | /* fill start point for scan */ | 5034 | /* fill start point for scan */ |
5033 | tmpid = id; | 5035 | tmpid = id; |
5034 | while (1) { | 5036 | while (1) { |
@@ -5036,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
5036 | * scan next entry from bitmap(tree), tmpid is updated after | 5038 | * scan next entry from bitmap(tree), tmpid is updated after |
5037 | * idr_get_next(). | 5039 | * idr_get_next(). |
5038 | */ | 5040 | */ |
5039 | read_lock(&ss->id_lock); | ||
5040 | tmp = idr_get_next(&ss->idr, &tmpid); | 5041 | tmp = idr_get_next(&ss->idr, &tmpid); |
5041 | read_unlock(&ss->id_lock); | ||
5042 | |||
5043 | if (!tmp) | 5042 | if (!tmp) |
5044 | break; | 5043 | break; |
5045 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { | 5044 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d575836dba6..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
964 | { | 964 | { |
965 | bool need_loop; | 965 | bool need_loop; |
966 | 966 | ||
967 | repeat: | ||
968 | /* | 967 | /* |
969 | * Allow tasks that have access to memory reserves because they have | 968 | * Allow tasks that have access to memory reserves because they have |
970 | * been OOM killed to get memory anywhere. | 969 | * been OOM killed to get memory anywhere. |
@@ -983,45 +982,19 @@ repeat: | |||
983 | */ | 982 | */ |
984 | need_loop = task_has_mempolicy(tsk) || | 983 | need_loop = task_has_mempolicy(tsk) || |
985 | !nodes_intersects(*newmems, tsk->mems_allowed); | 984 | !nodes_intersects(*newmems, tsk->mems_allowed); |
986 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | ||
987 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | ||
988 | 985 | ||
989 | /* | 986 | if (need_loop) |
990 | * ensure checking ->mems_allowed_change_disable after setting all new | 987 | write_seqcount_begin(&tsk->mems_allowed_seq); |
991 | * allowed nodes. | ||
992 | * | ||
993 | * the read-side task can see an nodemask with new allowed nodes and | ||
994 | * old allowed nodes. and if it allocates page when cpuset clears newly | ||
995 | * disallowed ones continuous, it can see the new allowed bits. | ||
996 | * | ||
997 | * And if setting all new allowed nodes is after the checking, setting | ||
998 | * all new allowed nodes and clearing newly disallowed ones will be done | ||
999 | * continuous, and the read-side task may find no node to alloc page. | ||
1000 | */ | ||
1001 | smp_mb(); | ||
1002 | 988 | ||
1003 | /* | 989 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
1004 | * Allocation of memory is very fast, we needn't sleep when waiting | 990 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); |
1005 | * for the read-side. | ||
1006 | */ | ||
1007 | while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { | ||
1008 | task_unlock(tsk); | ||
1009 | if (!task_curr(tsk)) | ||
1010 | yield(); | ||
1011 | goto repeat; | ||
1012 | } | ||
1013 | |||
1014 | /* | ||
1015 | * ensure checking ->mems_allowed_change_disable before clearing all new | ||
1016 | * disallowed nodes. | ||
1017 | * | ||
1018 | * if clearing newly disallowed bits before the checking, the read-side | ||
1019 | * task may find no node to alloc page. | ||
1020 | */ | ||
1021 | smp_mb(); | ||
1022 | 991 | ||
1023 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); | 992 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); |
1024 | tsk->mems_allowed = *newmems; | 993 | tsk->mems_allowed = *newmems; |
994 | |||
995 | if (need_loop) | ||
996 | write_seqcount_end(&tsk->mems_allowed_seq); | ||
997 | |||
1025 | task_unlock(tsk); | 998 | task_unlock(tsk); |
1026 | } | 999 | } |
1027 | 1000 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 7ad335c3045a..16b07bfac224 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -935,7 +935,7 @@ void do_exit(long code) | |||
935 | acct_update_integrals(tsk); | 935 | acct_update_integrals(tsk); |
936 | /* sync mm's RSS info before statistics gathering */ | 936 | /* sync mm's RSS info before statistics gathering */ |
937 | if (tsk->mm) | 937 | if (tsk->mm) |
938 | sync_mm_rss(tsk, tsk->mm); | 938 | sync_mm_rss(tsk->mm); |
939 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 939 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
940 | if (group_dead) { | 940 | if (group_dead) { |
941 | hrtimer_cancel(&tsk->signal->real_timer); | 941 | hrtimer_cancel(&tsk->signal->real_timer); |
diff --git a/kernel/fork.c b/kernel/fork.c index 26a7138bb849..37674ec55cde 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -512,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
512 | return NULL; | 512 | return NULL; |
513 | } | 513 | } |
514 | 514 | ||
515 | static void check_mm(struct mm_struct *mm) | ||
516 | { | ||
517 | int i; | ||
518 | |||
519 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
520 | long x = atomic_long_read(&mm->rss_stat.count[i]); | ||
521 | |||
522 | if (unlikely(x)) | ||
523 | printk(KERN_ALERT "BUG: Bad rss-counter state " | ||
524 | "mm:%p idx:%d val:%ld\n", mm, i, x); | ||
525 | } | ||
526 | |||
527 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
528 | VM_BUG_ON(mm->pmd_huge_pte); | ||
529 | #endif | ||
530 | } | ||
531 | |||
515 | /* | 532 | /* |
516 | * Allocate and initialize an mm_struct. | 533 | * Allocate and initialize an mm_struct. |
517 | */ | 534 | */ |
@@ -539,9 +556,7 @@ void __mmdrop(struct mm_struct *mm) | |||
539 | mm_free_pgd(mm); | 556 | mm_free_pgd(mm); |
540 | destroy_context(mm); | 557 | destroy_context(mm); |
541 | mmu_notifier_mm_destroy(mm); | 558 | mmu_notifier_mm_destroy(mm); |
542 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 559 | check_mm(mm); |
543 | VM_BUG_ON(mm->pmd_huge_pte); | ||
544 | #endif | ||
545 | free_mm(mm); | 560 | free_mm(mm); |
546 | } | 561 | } |
547 | EXPORT_SYMBOL_GPL(__mmdrop); | 562 | EXPORT_SYMBOL_GPL(__mmdrop); |
@@ -1223,6 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1223 | #ifdef CONFIG_CPUSETS | 1238 | #ifdef CONFIG_CPUSETS |
1224 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; | 1239 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
1225 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; | 1240 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
1241 | seqcount_init(&p->mems_allowed_seq); | ||
1226 | #endif | 1242 | #endif |
1227 | #ifdef CONFIG_TRACE_IRQFLAGS | 1243 | #ifdef CONFIG_TRACE_IRQFLAGS |
1228 | p->irq_events = 0; | 1244 | p->irq_events = 0; |
@@ -595,8 +595,10 @@ EXPORT_SYMBOL(idr_for_each); | |||
595 | * Returns pointer to registered object with id, which is next number to | 595 | * Returns pointer to registered object with id, which is next number to |
596 | * given id. After being looked up, *@nextidp will be updated for the next | 596 | * given id. After being looked up, *@nextidp will be updated for the next |
597 | * iteration. | 597 | * iteration. |
598 | * | ||
599 | * This function can be called under rcu_read_lock(), given that the leaf | ||
600 | * pointers lifetimes are correctly managed. | ||
598 | */ | 601 | */ |
599 | |||
600 | void *idr_get_next(struct idr *idp, int *nextidp) | 602 | void *idr_get_next(struct idr *idp, int *nextidp) |
601 | { | 603 | { |
602 | struct idr_layer *p, *pa[MAX_LEVEL]; | 604 | struct idr_layer *p, *pa[MAX_LEVEL]; |
@@ -605,11 +607,11 @@ void *idr_get_next(struct idr *idp, int *nextidp) | |||
605 | int n, max; | 607 | int n, max; |
606 | 608 | ||
607 | /* find first ent */ | 609 | /* find first ent */ |
608 | n = idp->layers * IDR_BITS; | ||
609 | max = 1 << n; | ||
610 | p = rcu_dereference_raw(idp->top); | 610 | p = rcu_dereference_raw(idp->top); |
611 | if (!p) | 611 | if (!p) |
612 | return NULL; | 612 | return NULL; |
613 | n = (p->layer + 1) * IDR_BITS; | ||
614 | max = 1 << n; | ||
613 | 615 | ||
614 | while (id < max) { | 616 | while (id < max) { |
615 | while (n > 0 && p) { | 617 | while (n > 0 && p) { |
diff --git a/mm/bootmem.c b/mm/bootmem.c index 668e94df8cf2..0131170c9d54 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
766 | unsigned long section_nr) | 766 | unsigned long section_nr) |
767 | { | 767 | { |
768 | bootmem_data_t *bdata; | 768 | bootmem_data_t *bdata; |
769 | unsigned long pfn, goal, limit; | 769 | unsigned long pfn, goal; |
770 | 770 | ||
771 | pfn = section_nr_to_pfn(section_nr); | 771 | pfn = section_nr_to_pfn(section_nr); |
772 | goal = pfn << PAGE_SHIFT; | 772 | goal = pfn << PAGE_SHIFT; |
773 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
774 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
775 | 774 | ||
776 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); |
777 | } | 776 | } |
778 | #endif | 777 | #endif |
779 | 778 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index d9ebebe1a2aa..74a8c825ff28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -35,7 +35,7 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | unsigned int order; /* order a direct compactor needs */ | 38 | int order; /* order a direct compactor needs */ |
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
40 | struct zone *zone; | 40 | struct zone *zone; |
41 | }; | 41 | }; |
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
675 | 675 | ||
676 | 676 | ||
677 | /* Compact all zones within a node */ | 677 | /* Compact all zones within a node */ |
678 | static int compact_node(int nid) | 678 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
679 | { | 679 | { |
680 | int zoneid; | 680 | int zoneid; |
681 | pg_data_t *pgdat; | ||
682 | struct zone *zone; | 681 | struct zone *zone; |
683 | 682 | ||
684 | if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) | ||
685 | return -EINVAL; | ||
686 | pgdat = NODE_DATA(nid); | ||
687 | |||
688 | /* Flush pending updates to the LRU lists */ | ||
689 | lru_add_drain_all(); | ||
690 | |||
691 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | 683 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
692 | struct compact_control cc = { | ||
693 | .nr_freepages = 0, | ||
694 | .nr_migratepages = 0, | ||
695 | .order = -1, | ||
696 | .sync = true, | ||
697 | }; | ||
698 | 684 | ||
699 | zone = &pgdat->node_zones[zoneid]; | 685 | zone = &pgdat->node_zones[zoneid]; |
700 | if (!populated_zone(zone)) | 686 | if (!populated_zone(zone)) |
701 | continue; | 687 | continue; |
702 | 688 | ||
703 | cc.zone = zone; | 689 | cc->nr_freepages = 0; |
704 | INIT_LIST_HEAD(&cc.freepages); | 690 | cc->nr_migratepages = 0; |
705 | INIT_LIST_HEAD(&cc.migratepages); | 691 | cc->zone = zone; |
706 | 692 | INIT_LIST_HEAD(&cc->freepages); | |
707 | compact_zone(zone, &cc); | 693 | INIT_LIST_HEAD(&cc->migratepages); |
694 | |||
695 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | ||
696 | compact_zone(zone, cc); | ||
697 | |||
698 | if (cc->order > 0) { | ||
699 | int ok = zone_watermark_ok(zone, cc->order, | ||
700 | low_wmark_pages(zone), 0, 0); | ||
701 | if (ok && cc->order > zone->compact_order_failed) | ||
702 | zone->compact_order_failed = cc->order + 1; | ||
703 | /* Currently async compaction is never deferred. */ | ||
704 | else if (!ok && cc->sync) | ||
705 | defer_compaction(zone, cc->order); | ||
706 | } | ||
708 | 707 | ||
709 | VM_BUG_ON(!list_empty(&cc.freepages)); | 708 | VM_BUG_ON(!list_empty(&cc->freepages)); |
710 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 709 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
711 | } | 710 | } |
712 | 711 | ||
713 | return 0; | 712 | return 0; |
714 | } | 713 | } |
715 | 714 | ||
715 | int compact_pgdat(pg_data_t *pgdat, int order) | ||
716 | { | ||
717 | struct compact_control cc = { | ||
718 | .order = order, | ||
719 | .sync = false, | ||
720 | }; | ||
721 | |||
722 | return __compact_pgdat(pgdat, &cc); | ||
723 | } | ||
724 | |||
725 | static int compact_node(int nid) | ||
726 | { | ||
727 | struct compact_control cc = { | ||
728 | .order = -1, | ||
729 | .sync = true, | ||
730 | }; | ||
731 | |||
732 | return __compact_pgdat(NODE_DATA(nid), &cc); | ||
733 | } | ||
734 | |||
716 | /* Compact all nodes in the system */ | 735 | /* Compact all nodes in the system */ |
717 | static int compact_nodes(void) | 736 | static int compact_nodes(void) |
718 | { | 737 | { |
719 | int nid; | 738 | int nid; |
720 | 739 | ||
740 | /* Flush pending updates to the LRU lists */ | ||
741 | lru_add_drain_all(); | ||
742 | |||
721 | for_each_online_node(nid) | 743 | for_each_online_node(nid) |
722 | compact_node(nid); | 744 | compact_node(nid); |
723 | 745 | ||
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev, | |||
750 | struct device_attribute *attr, | 772 | struct device_attribute *attr, |
751 | const char *buf, size_t count) | 773 | const char *buf, size_t count) |
752 | { | 774 | { |
753 | compact_node(dev->id); | 775 | int nid = dev->id; |
776 | |||
777 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { | ||
778 | /* Flush pending updates to the LRU lists */ | ||
779 | lru_add_drain_all(); | ||
780 | |||
781 | compact_node(nid); | ||
782 | } | ||
754 | 783 | ||
755 | return count; | 784 | return count; |
756 | } | 785 | } |
diff --git a/mm/filemap.c b/mm/filemap.c index 2f8165075a5a..843042045dc9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -101,9 +101,8 @@ | |||
101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
103 | * | 103 | * |
104 | * (code doesn't rely on that order, so you could switch it around) | 104 | * ->i_mmap_mutex |
105 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 105 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
106 | * ->i_mmap_mutex | ||
107 | */ | 106 | */ |
108 | 107 | ||
109 | /* | 108 | /* |
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
500 | struct page *page; | 499 | struct page *page; |
501 | 500 | ||
502 | if (cpuset_do_page_mem_spread()) { | 501 | if (cpuset_do_page_mem_spread()) { |
503 | get_mems_allowed(); | 502 | unsigned int cpuset_mems_cookie; |
504 | n = cpuset_mem_spread_node(); | 503 | do { |
505 | page = alloc_pages_exact_node(n, gfp, 0); | 504 | cpuset_mems_cookie = get_mems_allowed(); |
506 | put_mems_allowed(); | 505 | n = cpuset_mem_spread_node(); |
506 | page = alloc_pages_exact_node(n, gfp, 0); | ||
507 | } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | ||
508 | |||
507 | return page; | 509 | return page; |
508 | } | 510 | } |
509 | return alloc_pages(gfp, 0); | 511 | return alloc_pages(gfp, 0); |
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2341 | struct page *page; | 2343 | struct page *page; |
2342 | gfp_t gfp_notmask = 0; | 2344 | gfp_t gfp_notmask = 0; |
2343 | 2345 | ||
2344 | gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; | 2346 | gfp_mask = mapping_gfp_mask(mapping); |
2347 | if (mapping_cap_account_dirty(mapping)) | ||
2348 | gfp_mask |= __GFP_WRITE; | ||
2345 | if (flags & AOP_FLAG_NOFS) | 2349 | if (flags & AOP_FLAG_NOFS) |
2346 | gfp_notmask = __GFP_FS; | 2350 | gfp_notmask = __GFP_FS; |
2347 | repeat: | 2351 | repeat: |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8f7fc394f636..f0e5306eeb55 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1031 | { | 1031 | { |
1032 | int ret = 0; | 1032 | int ret = 0; |
1033 | 1033 | ||
1034 | spin_lock(&tlb->mm->page_table_lock); | 1034 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1035 | if (likely(pmd_trans_huge(*pmd))) { | 1035 | struct page *page; |
1036 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1036 | pgtable_t pgtable; |
1037 | spin_unlock(&tlb->mm->page_table_lock); | 1037 | pgtable = get_pmd_huge_pte(tlb->mm); |
1038 | wait_split_huge_page(vma->anon_vma, | 1038 | page = pmd_page(*pmd); |
1039 | pmd); | 1039 | pmd_clear(pmd); |
1040 | } else { | 1040 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1041 | struct page *page; | 1041 | page_remove_rmap(page); |
1042 | pgtable_t pgtable; | 1042 | VM_BUG_ON(page_mapcount(page) < 0); |
1043 | pgtable = get_pmd_huge_pte(tlb->mm); | 1043 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1044 | page = pmd_page(*pmd); | 1044 | VM_BUG_ON(!PageHead(page)); |
1045 | pmd_clear(pmd); | 1045 | tlb->mm->nr_ptes--; |
1046 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | ||
1047 | page_remove_rmap(page); | ||
1048 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1049 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1050 | VM_BUG_ON(!PageHead(page)); | ||
1051 | tlb->mm->nr_ptes--; | ||
1052 | spin_unlock(&tlb->mm->page_table_lock); | ||
1053 | tlb_remove_page(tlb, page); | ||
1054 | pte_free(tlb->mm, pgtable); | ||
1055 | ret = 1; | ||
1056 | } | ||
1057 | } else | ||
1058 | spin_unlock(&tlb->mm->page_table_lock); | 1046 | spin_unlock(&tlb->mm->page_table_lock); |
1059 | 1047 | tlb_remove_page(tlb, page); | |
1048 | pte_free(tlb->mm, pgtable); | ||
1049 | ret = 1; | ||
1050 | } | ||
1060 | return ret; | 1051 | return ret; |
1061 | } | 1052 | } |
1062 | 1053 | ||
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1066 | { | 1057 | { |
1067 | int ret = 0; | 1058 | int ret = 0; |
1068 | 1059 | ||
1069 | spin_lock(&vma->vm_mm->page_table_lock); | 1060 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1070 | if (likely(pmd_trans_huge(*pmd))) { | 1061 | /* |
1071 | ret = !pmd_trans_splitting(*pmd); | 1062 | * All logical pages in the range are present |
1072 | spin_unlock(&vma->vm_mm->page_table_lock); | 1063 | * if backed by a huge page. |
1073 | if (unlikely(!ret)) | 1064 | */ |
1074 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1075 | else { | ||
1076 | /* | ||
1077 | * All logical pages in the range are present | ||
1078 | * if backed by a huge page. | ||
1079 | */ | ||
1080 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1081 | } | ||
1082 | } else | ||
1083 | spin_unlock(&vma->vm_mm->page_table_lock); | 1065 | spin_unlock(&vma->vm_mm->page_table_lock); |
1066 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1067 | ret = 1; | ||
1068 | } | ||
1084 | 1069 | ||
1085 | return ret; | 1070 | return ret; |
1086 | } | 1071 | } |
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1110 | goto out; | 1095 | goto out; |
1111 | } | 1096 | } |
1112 | 1097 | ||
1113 | spin_lock(&mm->page_table_lock); | 1098 | ret = __pmd_trans_huge_lock(old_pmd, vma); |
1114 | if (likely(pmd_trans_huge(*old_pmd))) { | 1099 | if (ret == 1) { |
1115 | if (pmd_trans_splitting(*old_pmd)) { | 1100 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1116 | spin_unlock(&mm->page_table_lock); | 1101 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1117 | wait_split_huge_page(vma->anon_vma, old_pmd); | 1102 | set_pmd_at(mm, new_addr, new_pmd, pmd); |
1118 | ret = -1; | ||
1119 | } else { | ||
1120 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | ||
1121 | VM_BUG_ON(!pmd_none(*new_pmd)); | ||
1122 | set_pmd_at(mm, new_addr, new_pmd, pmd); | ||
1123 | spin_unlock(&mm->page_table_lock); | ||
1124 | ret = 1; | ||
1125 | } | ||
1126 | } else { | ||
1127 | spin_unlock(&mm->page_table_lock); | 1103 | spin_unlock(&mm->page_table_lock); |
1128 | } | 1104 | } |
1129 | out: | 1105 | out: |
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1136 | struct mm_struct *mm = vma->vm_mm; | 1112 | struct mm_struct *mm = vma->vm_mm; |
1137 | int ret = 0; | 1113 | int ret = 0; |
1138 | 1114 | ||
1139 | spin_lock(&mm->page_table_lock); | 1115 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1116 | pmd_t entry; | ||
1117 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1118 | entry = pmd_modify(entry, newprot); | ||
1119 | set_pmd_at(mm, addr, pmd, entry); | ||
1120 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1121 | ret = 1; | ||
1122 | } | ||
1123 | |||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1127 | /* | ||
1128 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | ||
1129 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | ||
1130 | * | ||
1131 | * Note that if it returns 1, this routine returns without unlocking page | ||
1132 | * table locks. So callers must unlock them. | ||
1133 | */ | ||
1134 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | ||
1135 | { | ||
1136 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1140 | if (likely(pmd_trans_huge(*pmd))) { | 1137 | if (likely(pmd_trans_huge(*pmd))) { |
1141 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1138 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1142 | spin_unlock(&mm->page_table_lock); | 1139 | spin_unlock(&vma->vm_mm->page_table_lock); |
1143 | wait_split_huge_page(vma->anon_vma, pmd); | 1140 | wait_split_huge_page(vma->anon_vma, pmd); |
1141 | return -1; | ||
1144 | } else { | 1142 | } else { |
1145 | pmd_t entry; | 1143 | /* Thp mapped by 'pmd' is stable, so we can |
1146 | 1144 | * handle it as it is. */ | |
1147 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1145 | return 1; |
1148 | entry = pmd_modify(entry, newprot); | ||
1149 | set_pmd_at(mm, addr, pmd, entry); | ||
1150 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1151 | ret = 1; | ||
1152 | } | 1146 | } |
1153 | } else | 1147 | } |
1154 | spin_unlock(&vma->vm_mm->page_table_lock); | 1148 | spin_unlock(&vma->vm_mm->page_table_lock); |
1155 | 1149 | return 0; | |
1156 | return ret; | ||
1157 | } | 1150 | } |
1158 | 1151 | ||
1159 | pmd_t *page_check_address_pmd(struct page *page, | 1152 | pmd_t *page_check_address_pmd(struct page *page, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a876871f6be5..afa057a1d3fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; | |||
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | static DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | ||
57 | { | ||
58 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | ||
59 | |||
60 | spin_unlock(&spool->lock); | ||
61 | |||
62 | /* If no pages are used, and no other handles to the subpool | ||
63 | * remain, free the subpool the subpool remain */ | ||
64 | if (free) | ||
65 | kfree(spool); | ||
66 | } | ||
67 | |||
68 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | ||
69 | { | ||
70 | struct hugepage_subpool *spool; | ||
71 | |||
72 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | ||
73 | if (!spool) | ||
74 | return NULL; | ||
75 | |||
76 | spin_lock_init(&spool->lock); | ||
77 | spool->count = 1; | ||
78 | spool->max_hpages = nr_blocks; | ||
79 | spool->used_hpages = 0; | ||
80 | |||
81 | return spool; | ||
82 | } | ||
83 | |||
84 | void hugepage_put_subpool(struct hugepage_subpool *spool) | ||
85 | { | ||
86 | spin_lock(&spool->lock); | ||
87 | BUG_ON(!spool->count); | ||
88 | spool->count--; | ||
89 | unlock_or_release_subpool(spool); | ||
90 | } | ||
91 | |||
92 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
93 | long delta) | ||
94 | { | ||
95 | int ret = 0; | ||
96 | |||
97 | if (!spool) | ||
98 | return 0; | ||
99 | |||
100 | spin_lock(&spool->lock); | ||
101 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | ||
102 | spool->used_hpages += delta; | ||
103 | } else { | ||
104 | ret = -ENOMEM; | ||
105 | } | ||
106 | spin_unlock(&spool->lock); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
112 | long delta) | ||
113 | { | ||
114 | if (!spool) | ||
115 | return; | ||
116 | |||
117 | spin_lock(&spool->lock); | ||
118 | spool->used_hpages -= delta; | ||
119 | /* If hugetlbfs_put_super couldn't free spool due to | ||
120 | * an outstanding quota reference, free it now. */ | ||
121 | unlock_or_release_subpool(spool); | ||
122 | } | ||
123 | |||
124 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | ||
125 | { | ||
126 | return HUGETLBFS_SB(inode->i_sb)->spool; | ||
127 | } | ||
128 | |||
129 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | ||
130 | { | ||
131 | return subpool_inode(vma->vm_file->f_dentry->d_inode); | ||
132 | } | ||
133 | |||
56 | /* | 134 | /* |
57 | * Region tracking -- allows tracking of reservations and instantiated pages | 135 | * Region tracking -- allows tracking of reservations and instantiated pages |
58 | * across the pages in a mapping. | 136 | * across the pages in a mapping. |
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
454 | struct vm_area_struct *vma, | 532 | struct vm_area_struct *vma, |
455 | unsigned long address, int avoid_reserve) | 533 | unsigned long address, int avoid_reserve) |
456 | { | 534 | { |
457 | struct page *page = NULL; | 535 | struct page *page; |
458 | struct mempolicy *mpol; | 536 | struct mempolicy *mpol; |
459 | nodemask_t *nodemask; | 537 | nodemask_t *nodemask; |
460 | struct zonelist *zonelist; | 538 | struct zonelist *zonelist; |
461 | struct zone *zone; | 539 | struct zone *zone; |
462 | struct zoneref *z; | 540 | struct zoneref *z; |
541 | unsigned int cpuset_mems_cookie; | ||
463 | 542 | ||
464 | get_mems_allowed(); | 543 | retry_cpuset: |
544 | cpuset_mems_cookie = get_mems_allowed(); | ||
465 | zonelist = huge_zonelist(vma, address, | 545 | zonelist = huge_zonelist(vma, address, |
466 | htlb_alloc_mask, &mpol, &nodemask); | 546 | htlb_alloc_mask, &mpol, &nodemask); |
467 | /* | 547 | /* |
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
488 | } | 568 | } |
489 | } | 569 | } |
490 | } | 570 | } |
491 | err: | 571 | |
492 | mpol_cond_put(mpol); | 572 | mpol_cond_put(mpol); |
493 | put_mems_allowed(); | 573 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
574 | goto retry_cpuset; | ||
494 | return page; | 575 | return page; |
576 | |||
577 | err: | ||
578 | mpol_cond_put(mpol); | ||
579 | return NULL; | ||
495 | } | 580 | } |
496 | 581 | ||
497 | static void update_and_free_page(struct hstate *h, struct page *page) | 582 | static void update_and_free_page(struct hstate *h, struct page *page) |
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page) | |||
533 | */ | 618 | */ |
534 | struct hstate *h = page_hstate(page); | 619 | struct hstate *h = page_hstate(page); |
535 | int nid = page_to_nid(page); | 620 | int nid = page_to_nid(page); |
536 | struct address_space *mapping; | 621 | struct hugepage_subpool *spool = |
622 | (struct hugepage_subpool *)page_private(page); | ||
537 | 623 | ||
538 | mapping = (struct address_space *) page_private(page); | ||
539 | set_page_private(page, 0); | 624 | set_page_private(page, 0); |
540 | page->mapping = NULL; | 625 | page->mapping = NULL; |
541 | BUG_ON(page_count(page)); | 626 | BUG_ON(page_count(page)); |
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page) | |||
551 | enqueue_huge_page(h, page); | 636 | enqueue_huge_page(h, page); |
552 | } | 637 | } |
553 | spin_unlock(&hugetlb_lock); | 638 | spin_unlock(&hugetlb_lock); |
554 | if (mapping) | 639 | hugepage_subpool_put_pages(spool, 1); |
555 | hugetlb_put_quota(mapping, 1); | ||
556 | } | 640 | } |
557 | 641 | ||
558 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
852 | struct page *page, *tmp; | 936 | struct page *page, *tmp; |
853 | int ret, i; | 937 | int ret, i; |
854 | int needed, allocated; | 938 | int needed, allocated; |
939 | bool alloc_ok = true; | ||
855 | 940 | ||
856 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; | 941 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
857 | if (needed <= 0) { | 942 | if (needed <= 0) { |
@@ -867,17 +952,13 @@ retry: | |||
867 | spin_unlock(&hugetlb_lock); | 952 | spin_unlock(&hugetlb_lock); |
868 | for (i = 0; i < needed; i++) { | 953 | for (i = 0; i < needed; i++) { |
869 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 954 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
870 | if (!page) | 955 | if (!page) { |
871 | /* | 956 | alloc_ok = false; |
872 | * We were not able to allocate enough pages to | 957 | break; |
873 | * satisfy the entire reservation so we free what | 958 | } |
874 | * we've allocated so far. | ||
875 | */ | ||
876 | goto free; | ||
877 | |||
878 | list_add(&page->lru, &surplus_list); | 959 | list_add(&page->lru, &surplus_list); |
879 | } | 960 | } |
880 | allocated += needed; | 961 | allocated += i; |
881 | 962 | ||
882 | /* | 963 | /* |
883 | * After retaking hugetlb_lock, we need to recalculate 'needed' | 964 | * After retaking hugetlb_lock, we need to recalculate 'needed' |
@@ -886,9 +967,16 @@ retry: | |||
886 | spin_lock(&hugetlb_lock); | 967 | spin_lock(&hugetlb_lock); |
887 | needed = (h->resv_huge_pages + delta) - | 968 | needed = (h->resv_huge_pages + delta) - |
888 | (h->free_huge_pages + allocated); | 969 | (h->free_huge_pages + allocated); |
889 | if (needed > 0) | 970 | if (needed > 0) { |
890 | goto retry; | 971 | if (alloc_ok) |
891 | 972 | goto retry; | |
973 | /* | ||
974 | * We were not able to allocate enough pages to | ||
975 | * satisfy the entire reservation so we free what | ||
976 | * we've allocated so far. | ||
977 | */ | ||
978 | goto free; | ||
979 | } | ||
892 | /* | 980 | /* |
893 | * The surplus_list now contains _at_least_ the number of extra pages | 981 | * The surplus_list now contains _at_least_ the number of extra pages |
894 | * needed to accommodate the reservation. Add the appropriate number | 982 | * needed to accommodate the reservation. Add the appropriate number |
@@ -914,10 +1002,10 @@ retry: | |||
914 | VM_BUG_ON(page_count(page)); | 1002 | VM_BUG_ON(page_count(page)); |
915 | enqueue_huge_page(h, page); | 1003 | enqueue_huge_page(h, page); |
916 | } | 1004 | } |
1005 | free: | ||
917 | spin_unlock(&hugetlb_lock); | 1006 | spin_unlock(&hugetlb_lock); |
918 | 1007 | ||
919 | /* Free unnecessary surplus pages to the buddy allocator */ | 1008 | /* Free unnecessary surplus pages to the buddy allocator */ |
920 | free: | ||
921 | if (!list_empty(&surplus_list)) { | 1009 | if (!list_empty(&surplus_list)) { |
922 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
923 | list_del(&page->lru); | 1011 | list_del(&page->lru); |
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
966 | /* | 1054 | /* |
967 | * Determine if the huge page at addr within the vma has an associated | 1055 | * Determine if the huge page at addr within the vma has an associated |
968 | * reservation. Where it does not we will need to logically increase | 1056 | * reservation. Where it does not we will need to logically increase |
969 | * reservation and actually increase quota before an allocation can occur. | 1057 | * reservation and actually increase subpool usage before an allocation |
970 | * Where any new reservation would be required the reservation change is | 1058 | * can occur. Where any new reservation would be required the |
971 | * prepared, but not committed. Once the page has been quota'd allocated | 1059 | * reservation change is prepared, but not committed. Once the page |
972 | * an instantiated the change should be committed via vma_commit_reservation. | 1060 | * has been allocated from the subpool and instantiated the change should |
973 | * No action is required on failure. | 1061 | * be committed via vma_commit_reservation. No action is required on |
1062 | * failure. | ||
974 | */ | 1063 | */ |
975 | static long vma_needs_reservation(struct hstate *h, | 1064 | static long vma_needs_reservation(struct hstate *h, |
976 | struct vm_area_struct *vma, unsigned long addr) | 1065 | struct vm_area_struct *vma, unsigned long addr) |
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, | |||
1019 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1108 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
1020 | unsigned long addr, int avoid_reserve) | 1109 | unsigned long addr, int avoid_reserve) |
1021 | { | 1110 | { |
1111 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
1022 | struct hstate *h = hstate_vma(vma); | 1112 | struct hstate *h = hstate_vma(vma); |
1023 | struct page *page; | 1113 | struct page *page; |
1024 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
1025 | struct inode *inode = mapping->host; | ||
1026 | long chg; | 1114 | long chg; |
1027 | 1115 | ||
1028 | /* | 1116 | /* |
1029 | * Processes that did not create the mapping will have no reserves and | 1117 | * Processes that did not create the mapping will have no |
1030 | * will not have accounted against quota. Check that the quota can be | 1118 | * reserves and will not have accounted against subpool |
1031 | * made before satisfying the allocation | 1119 | * limit. Check that the subpool limit can be made before |
1032 | * MAP_NORESERVE mappings may also need pages and quota allocated | 1120 | * satisfying the allocation. MAP_NORESERVE mappings may also |
1033 | * if no reserve mapping overlaps. | 1121 | * need pages and subpool limit allocated if no reserve |
1122 | * mapping overlaps. | ||
1034 | */ | 1123 | */ |
1035 | chg = vma_needs_reservation(h, vma, addr); | 1124 | chg = vma_needs_reservation(h, vma, addr); |
1036 | if (chg < 0) | 1125 | if (chg < 0) |
1037 | return ERR_PTR(-VM_FAULT_OOM); | 1126 | return ERR_PTR(-VM_FAULT_OOM); |
1038 | if (chg) | 1127 | if (chg) |
1039 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1128 | if (hugepage_subpool_get_pages(spool, chg)) |
1040 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1129 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1041 | 1130 | ||
1042 | spin_lock(&hugetlb_lock); | 1131 | spin_lock(&hugetlb_lock); |
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1046 | if (!page) { | 1135 | if (!page) { |
1047 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1048 | if (!page) { | 1137 | if (!page) { |
1049 | hugetlb_put_quota(inode->i_mapping, chg); | 1138 | hugepage_subpool_put_pages(spool, chg); |
1050 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1139 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1051 | } | 1140 | } |
1052 | } | 1141 | } |
1053 | 1142 | ||
1054 | set_page_private(page, (unsigned long) mapping); | 1143 | set_page_private(page, (unsigned long)spool); |
1055 | 1144 | ||
1056 | vma_commit_reservation(h, vma, addr); | 1145 | vma_commit_reservation(h, vma, addr); |
1057 | 1146 | ||
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2072 | { | 2161 | { |
2073 | struct hstate *h = hstate_vma(vma); | 2162 | struct hstate *h = hstate_vma(vma); |
2074 | struct resv_map *reservations = vma_resv_map(vma); | 2163 | struct resv_map *reservations = vma_resv_map(vma); |
2164 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
2075 | unsigned long reserve; | 2165 | unsigned long reserve; |
2076 | unsigned long start; | 2166 | unsigned long start; |
2077 | unsigned long end; | 2167 | unsigned long end; |
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2087 | 2177 | ||
2088 | if (reserve) { | 2178 | if (reserve) { |
2089 | hugetlb_acct_memory(h, -reserve); | 2179 | hugetlb_acct_memory(h, -reserve); |
2090 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | 2180 | hugepage_subpool_put_pages(spool, reserve); |
2091 | } | 2181 | } |
2092 | } | 2182 | } |
2093 | } | 2183 | } |
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2276 | if (pte_dirty(pte)) | 2366 | if (pte_dirty(pte)) |
2277 | set_page_dirty(page); | 2367 | set_page_dirty(page); |
2278 | list_add(&page->lru, &page_list); | 2368 | list_add(&page->lru, &page_list); |
2369 | |||
2370 | /* Bail out after unmapping reference page if supplied */ | ||
2371 | if (ref_page) | ||
2372 | break; | ||
2279 | } | 2373 | } |
2280 | flush_tlb_range(vma, start, end); | 2374 | flush_tlb_range(vma, start, end); |
2281 | spin_unlock(&mm->page_table_lock); | 2375 | spin_unlock(&mm->page_table_lock); |
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2316 | */ | 2410 | */ |
2317 | address = address & huge_page_mask(h); | 2411 | address = address & huge_page_mask(h); |
2318 | pgoff = vma_hugecache_offset(h, vma, address); | 2412 | pgoff = vma_hugecache_offset(h, vma, address); |
2319 | mapping = (struct address_space *)page_private(page); | 2413 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2320 | 2414 | ||
2321 | /* | 2415 | /* |
2322 | * Take the mapping lock for the duration of the table walk. As | 2416 | * Take the mapping lock for the duration of the table walk. As |
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2869 | { | 2963 | { |
2870 | long ret, chg; | 2964 | long ret, chg; |
2871 | struct hstate *h = hstate_inode(inode); | 2965 | struct hstate *h = hstate_inode(inode); |
2966 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2872 | 2967 | ||
2873 | /* | 2968 | /* |
2874 | * Only apply hugepage reservation if asked. At fault time, an | 2969 | * Only apply hugepage reservation if asked. At fault time, an |
2875 | * attempt will be made for VM_NORESERVE to allocate a page | 2970 | * attempt will be made for VM_NORESERVE to allocate a page |
2876 | * and filesystem quota without using reserves | 2971 | * without using reserves |
2877 | */ | 2972 | */ |
2878 | if (vm_flags & VM_NORESERVE) | 2973 | if (vm_flags & VM_NORESERVE) |
2879 | return 0; | 2974 | return 0; |
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2900 | if (chg < 0) | 2995 | if (chg < 0) |
2901 | return chg; | 2996 | return chg; |
2902 | 2997 | ||
2903 | /* There must be enough filesystem quota for the mapping */ | 2998 | /* There must be enough pages in the subpool for the mapping */ |
2904 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2999 | if (hugepage_subpool_get_pages(spool, chg)) |
2905 | return -ENOSPC; | 3000 | return -ENOSPC; |
2906 | 3001 | ||
2907 | /* | 3002 | /* |
2908 | * Check enough hugepages are available for the reservation. | 3003 | * Check enough hugepages are available for the reservation. |
2909 | * Hand back the quota if there are not | 3004 | * Hand the pages back to the subpool if there are not |
2910 | */ | 3005 | */ |
2911 | ret = hugetlb_acct_memory(h, chg); | 3006 | ret = hugetlb_acct_memory(h, chg); |
2912 | if (ret < 0) { | 3007 | if (ret < 0) { |
2913 | hugetlb_put_quota(inode->i_mapping, chg); | 3008 | hugepage_subpool_put_pages(spool, chg); |
2914 | return ret; | 3009 | return ret; |
2915 | } | 3010 | } |
2916 | 3011 | ||
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2934 | { | 3029 | { |
2935 | struct hstate *h = hstate_inode(inode); | 3030 | struct hstate *h = hstate_inode(inode); |
2936 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 3031 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
3032 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2937 | 3033 | ||
2938 | spin_lock(&inode->i_lock); | 3034 | spin_lock(&inode->i_lock); |
2939 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3035 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
2940 | spin_unlock(&inode->i_lock); | 3036 | spin_unlock(&inode->i_lock); |
2941 | 3037 | ||
2942 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 3038 | hugepage_subpool_put_pages(spool, (chg - freed)); |
2943 | hugetlb_acct_memory(h, -(chg - freed)); | 3039 | hugetlb_acct_memory(h, -(chg - freed)); |
2944 | } | 3040 | } |
2945 | 3041 | ||
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
374 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 374 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
375 | } | 375 | } |
376 | 376 | ||
377 | static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, | ||
378 | unsigned long addr) | ||
379 | { | ||
380 | struct vm_area_struct *vma; | ||
381 | if (ksm_test_exit(mm)) | ||
382 | return NULL; | ||
383 | vma = find_vma(mm, addr); | ||
384 | if (!vma || vma->vm_start > addr) | ||
385 | return NULL; | ||
386 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
387 | return NULL; | ||
388 | return vma; | ||
389 | } | ||
390 | |||
377 | static void break_cow(struct rmap_item *rmap_item) | 391 | static void break_cow(struct rmap_item *rmap_item) |
378 | { | 392 | { |
379 | struct mm_struct *mm = rmap_item->mm; | 393 | struct mm_struct *mm = rmap_item->mm; |
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item) | |||
387 | put_anon_vma(rmap_item->anon_vma); | 401 | put_anon_vma(rmap_item->anon_vma); |
388 | 402 | ||
389 | down_read(&mm->mmap_sem); | 403 | down_read(&mm->mmap_sem); |
390 | if (ksm_test_exit(mm)) | 404 | vma = find_mergeable_vma(mm, addr); |
391 | goto out; | 405 | if (vma) |
392 | vma = find_vma(mm, addr); | 406 | break_ksm(vma, addr); |
393 | if (!vma || vma->vm_start > addr) | ||
394 | goto out; | ||
395 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
396 | goto out; | ||
397 | break_ksm(vma, addr); | ||
398 | out: | ||
399 | up_read(&mm->mmap_sem); | 407 | up_read(&mm->mmap_sem); |
400 | } | 408 | } |
401 | 409 | ||
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
421 | struct page *page; | 429 | struct page *page; |
422 | 430 | ||
423 | down_read(&mm->mmap_sem); | 431 | down_read(&mm->mmap_sem); |
424 | if (ksm_test_exit(mm)) | 432 | vma = find_mergeable_vma(mm, addr); |
425 | goto out; | 433 | if (!vma) |
426 | vma = find_vma(mm, addr); | ||
427 | if (!vma || vma->vm_start > addr) | ||
428 | goto out; | ||
429 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
430 | goto out; | 434 | goto out; |
431 | 435 | ||
432 | page = follow_page(vma, addr, FOLL_GET); | 436 | page = follow_page(vma, addr, FOLL_GET); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26c6f4ec20f4..b2ee6df0e9bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index { | |||
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
92 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | ||
93 | MEM_CGROUP_STAT_NSTATS, | 92 | MEM_CGROUP_STAT_NSTATS, |
94 | }; | 93 | }; |
95 | 94 | ||
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter { | |||
135 | */ | 134 | */ |
136 | struct mem_cgroup_per_zone { | 135 | struct mem_cgroup_per_zone { |
137 | struct lruvec lruvec; | 136 | struct lruvec lruvec; |
138 | unsigned long count[NR_LRU_LISTS]; | 137 | unsigned long lru_size[NR_LRU_LISTS]; |
139 | 138 | ||
140 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
141 | 140 | ||
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone { | |||
144 | unsigned long long usage_in_excess;/* Set to the value by which */ | 143 | unsigned long long usage_in_excess;/* Set to the value by which */ |
145 | /* the soft limit is exceeded*/ | 144 | /* the soft limit is exceeded*/ |
146 | bool on_tree; | 145 | bool on_tree; |
147 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | 146 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
148 | /* use container_of */ | 147 | /* use container_of */ |
149 | }; | 148 | }; |
150 | /* Macro for accessing counter */ | ||
151 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | ||
152 | 149 | ||
153 | struct mem_cgroup_per_node { | 150 | struct mem_cgroup_per_node { |
154 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 151 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
@@ -300,6 +297,12 @@ struct mem_cgroup { | |||
300 | */ | 297 | */ |
301 | unsigned long move_charge_at_immigrate; | 298 | unsigned long move_charge_at_immigrate; |
302 | /* | 299 | /* |
300 | * set > 0 if pages under this cgroup are moving to other cgroup. | ||
301 | */ | ||
302 | atomic_t moving_account; | ||
303 | /* taken only while moving_account > 0 */ | ||
304 | spinlock_t move_lock; | ||
305 | /* | ||
303 | * percpu counter. | 306 | * percpu counter. |
304 | */ | 307 | */ |
305 | struct mem_cgroup_stat_cpu *stat; | 308 | struct mem_cgroup_stat_cpu *stat; |
@@ -612,9 +615,9 @@ retry: | |||
612 | * we will add it back at the end of reclaim to its correct | 615 | * we will add it back at the end of reclaim to its correct |
613 | * position in the tree. | 616 | * position in the tree. |
614 | */ | 617 | */ |
615 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 618 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
616 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | 619 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || |
617 | !css_tryget(&mz->mem->css)) | 620 | !css_tryget(&mz->memcg->css)) |
618 | goto retry; | 621 | goto retry; |
619 | done: | 622 | done: |
620 | return mz; | 623 | return mz; |
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
692 | } | 695 | } |
693 | 696 | ||
694 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 697 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
695 | bool file, int nr_pages) | 698 | bool anon, int nr_pages) |
696 | { | 699 | { |
697 | preempt_disable(); | 700 | preempt_disable(); |
698 | 701 | ||
699 | if (file) | 702 | /* |
700 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 703 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
704 | * counted as CACHE even if it's on ANON LRU. | ||
705 | */ | ||
706 | if (anon) | ||
707 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | ||
701 | nr_pages); | 708 | nr_pages); |
702 | else | 709 | else |
703 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | 710 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
704 | nr_pages); | 711 | nr_pages); |
705 | 712 | ||
706 | /* pagein of a big page is an event. So, ignore page size */ | 713 | /* pagein of a big page is an event. So, ignore page size */ |
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, | |||
721 | unsigned int lru_mask) | 728 | unsigned int lru_mask) |
722 | { | 729 | { |
723 | struct mem_cgroup_per_zone *mz; | 730 | struct mem_cgroup_per_zone *mz; |
724 | enum lru_list l; | 731 | enum lru_list lru; |
725 | unsigned long ret = 0; | 732 | unsigned long ret = 0; |
726 | 733 | ||
727 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 734 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
728 | 735 | ||
729 | for_each_lru(l) { | 736 | for_each_lru(lru) { |
730 | if (BIT(l) & lru_mask) | 737 | if (BIT(lru) & lru_mask) |
731 | ret += MEM_CGROUP_ZSTAT(mz, l); | 738 | ret += mz->lru_size[lru]; |
732 | } | 739 | } |
733 | return ret; | 740 | return ret; |
734 | } | 741 | } |
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1077 | 1084 | ||
1078 | mz = page_cgroup_zoneinfo(memcg, page); | 1085 | mz = page_cgroup_zoneinfo(memcg, page); |
1079 | /* compound_order() is stabilized through lru_lock */ | 1086 | /* compound_order() is stabilized through lru_lock */ |
1080 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1087 | mz->lru_size[lru] += 1 << compound_order(page); |
1081 | return &mz->lruvec; | 1088 | return &mz->lruvec; |
1082 | } | 1089 | } |
1083 | 1090 | ||
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | |||
1105 | VM_BUG_ON(!memcg); | 1112 | VM_BUG_ON(!memcg); |
1106 | mz = page_cgroup_zoneinfo(memcg, page); | 1113 | mz = page_cgroup_zoneinfo(memcg, page); |
1107 | /* huge page split is done under lru_lock. so, we have no races. */ | 1114 | /* huge page split is done under lru_lock. so, we have no races. */ |
1108 | VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); | 1115 | VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); |
1109 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 1116 | mz->lru_size[lru] -= 1 << compound_order(page); |
1110 | } | 1117 | } |
1111 | 1118 | ||
1112 | void mem_cgroup_lru_del(struct page *page) | 1119 | void mem_cgroup_lru_del(struct page *page) |
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1285 | return memcg->swappiness; | 1292 | return memcg->swappiness; |
1286 | } | 1293 | } |
1287 | 1294 | ||
1288 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | 1295 | /* |
1289 | { | 1296 | * memcg->moving_account is used for checking possibility that some thread is |
1290 | int cpu; | 1297 | * calling move_account(). When a thread on CPU-A starts moving pages under |
1298 | * a memcg, other threads should check memcg->moving_account under | ||
1299 | * rcu_read_lock(), like this: | ||
1300 | * | ||
1301 | * CPU-A CPU-B | ||
1302 | * rcu_read_lock() | ||
1303 | * memcg->moving_account+1 if (memcg->moving_account) | ||
1304 | * take heavy locks. | ||
1305 | * synchronize_rcu() update something. | ||
1306 | * rcu_read_unlock() | ||
1307 | * start move here. | ||
1308 | */ | ||
1291 | 1309 | ||
1292 | get_online_cpus(); | 1310 | /* for quick checking without looking up memcg */ |
1293 | spin_lock(&memcg->pcp_counter_lock); | 1311 | atomic_t memcg_moving __read_mostly; |
1294 | for_each_online_cpu(cpu) | ||
1295 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | ||
1296 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | ||
1297 | spin_unlock(&memcg->pcp_counter_lock); | ||
1298 | put_online_cpus(); | ||
1299 | 1312 | ||
1313 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | ||
1314 | { | ||
1315 | atomic_inc(&memcg_moving); | ||
1316 | atomic_inc(&memcg->moving_account); | ||
1300 | synchronize_rcu(); | 1317 | synchronize_rcu(); |
1301 | } | 1318 | } |
1302 | 1319 | ||
1303 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | 1320 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1304 | { | 1321 | { |
1305 | int cpu; | 1322 | /* |
1306 | 1323 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | |
1307 | if (!memcg) | 1324 | * We check NULL in callee rather than caller. |
1308 | return; | 1325 | */ |
1309 | get_online_cpus(); | 1326 | if (memcg) { |
1310 | spin_lock(&memcg->pcp_counter_lock); | 1327 | atomic_dec(&memcg_moving); |
1311 | for_each_online_cpu(cpu) | 1328 | atomic_dec(&memcg->moving_account); |
1312 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1329 | } |
1313 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | ||
1314 | spin_unlock(&memcg->pcp_counter_lock); | ||
1315 | put_online_cpus(); | ||
1316 | } | 1330 | } |
1331 | |||
1317 | /* | 1332 | /* |
1318 | * 2 routines for checking "mem" is under move_account() or not. | 1333 | * 2 routines for checking "mem" is under move_account() or not. |
1319 | * | 1334 | * |
1320 | * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used | 1335 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This |
1321 | * for avoiding race in accounting. If true, | 1336 | * is used for avoiding races in accounting. If true, |
1322 | * pc->mem_cgroup may be overwritten. | 1337 | * pc->mem_cgroup may be overwritten. |
1323 | * | 1338 | * |
1324 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | 1339 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or |
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1326 | * waiting at high-memory pressure caused by "move". | 1341 | * waiting at high-memory pressure caused by "move". |
1327 | */ | 1342 | */ |
1328 | 1343 | ||
1329 | static bool mem_cgroup_stealed(struct mem_cgroup *memcg) | 1344 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) |
1330 | { | 1345 | { |
1331 | VM_BUG_ON(!rcu_read_lock_held()); | 1346 | VM_BUG_ON(!rcu_read_lock_held()); |
1332 | return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1347 | return atomic_read(&memcg->moving_account) > 0; |
1333 | } | 1348 | } |
1334 | 1349 | ||
1335 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1350 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1370 | return false; | 1385 | return false; |
1371 | } | 1386 | } |
1372 | 1387 | ||
1388 | /* | ||
1389 | * Take this lock when | ||
1390 | * - a code tries to modify page's memcg while it's USED. | ||
1391 | * - a code tries to modify page state accounting in a memcg. | ||
1392 | * see mem_cgroup_stolen(), too. | ||
1393 | */ | ||
1394 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | ||
1395 | unsigned long *flags) | ||
1396 | { | ||
1397 | spin_lock_irqsave(&memcg->move_lock, *flags); | ||
1398 | } | ||
1399 | |||
1400 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | ||
1401 | unsigned long *flags) | ||
1402 | { | ||
1403 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | ||
1404 | } | ||
1405 | |||
1373 | /** | 1406 | /** |
1374 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1407 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1375 | * @memcg: The memory cgroup that went over limit | 1408 | * @memcg: The memory cgroup that went over limit |
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1393 | if (!memcg || !p) | 1426 | if (!memcg || !p) |
1394 | return; | 1427 | return; |
1395 | 1428 | ||
1396 | |||
1397 | rcu_read_lock(); | 1429 | rcu_read_lock(); |
1398 | 1430 | ||
1399 | mem_cgrp = memcg->css.cgroup; | 1431 | mem_cgrp = memcg->css.cgroup; |
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock); | |||
1772 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1804 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1773 | 1805 | ||
1774 | struct oom_wait_info { | 1806 | struct oom_wait_info { |
1775 | struct mem_cgroup *mem; | 1807 | struct mem_cgroup *memcg; |
1776 | wait_queue_t wait; | 1808 | wait_queue_t wait; |
1777 | }; | 1809 | }; |
1778 | 1810 | ||
1779 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1811 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1780 | unsigned mode, int sync, void *arg) | 1812 | unsigned mode, int sync, void *arg) |
1781 | { | 1813 | { |
1782 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, | 1814 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
1783 | *oom_wait_memcg; | 1815 | struct mem_cgroup *oom_wait_memcg; |
1784 | struct oom_wait_info *oom_wait_info; | 1816 | struct oom_wait_info *oom_wait_info; |
1785 | 1817 | ||
1786 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1818 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1787 | oom_wait_memcg = oom_wait_info->mem; | 1819 | oom_wait_memcg = oom_wait_info->memcg; |
1788 | 1820 | ||
1789 | /* | 1821 | /* |
1790 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1822 | * Both of oom_wait_info->memcg and wake_memcg are stable under us. |
1791 | * Then we can use css_is_ancestor without taking care of RCU. | 1823 | * Then we can use css_is_ancestor without taking care of RCU. |
1792 | */ | 1824 | */ |
1793 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | 1825 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1811 | /* | 1843 | /* |
1812 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1844 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1813 | */ | 1845 | */ |
1814 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | 1846 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
1815 | { | 1847 | { |
1816 | struct oom_wait_info owait; | 1848 | struct oom_wait_info owait; |
1817 | bool locked, need_to_kill; | 1849 | bool locked, need_to_kill; |
1818 | 1850 | ||
1819 | owait.mem = memcg; | 1851 | owait.memcg = memcg; |
1820 | owait.wait.flags = 0; | 1852 | owait.wait.flags = 0; |
1821 | owait.wait.func = memcg_oom_wake_function; | 1853 | owait.wait.func = memcg_oom_wake_function; |
1822 | owait.wait.private = current; | 1854 | owait.wait.private = current; |
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1841 | 1873 | ||
1842 | if (need_to_kill) { | 1874 | if (need_to_kill) { |
1843 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1875 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1844 | mem_cgroup_out_of_memory(memcg, mask); | 1876 | mem_cgroup_out_of_memory(memcg, mask, order); |
1845 | } else { | 1877 | } else { |
1846 | schedule(); | 1878 | schedule(); |
1847 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1879 | finish_wait(&memcg_oom_waitq, &owait.wait); |
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1881 | * by flags. | 1913 | * by flags. |
1882 | * | 1914 | * |
1883 | * Considering "move", this is an only case we see a race. To make the race | 1915 | * Considering "move", this is an only case we see a race. To make the race |
1884 | * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are | 1916 | * small, we check memcg->moving_account to detect the possibility of a race. |
1885 | * possibility of race condition. If there is, we take a lock. | 1917 | * If there is, we take a lock. |
1886 | */ | 1918 | */ |
1887 | 1919 | ||
1920 | void __mem_cgroup_begin_update_page_stat(struct page *page, | ||
1921 | bool *locked, unsigned long *flags) | ||
1922 | { | ||
1923 | struct mem_cgroup *memcg; | ||
1924 | struct page_cgroup *pc; | ||
1925 | |||
1926 | pc = lookup_page_cgroup(page); | ||
1927 | again: | ||
1928 | memcg = pc->mem_cgroup; | ||
1929 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | ||
1930 | return; | ||
1931 | /* | ||
1932 | * If this memory cgroup is not under account moving, we don't | ||
1933 | * need to take move_lock_mem_cgroup(). Because we already hold | ||
1934 | * rcu_read_lock(), any calls to move_account will be delayed until | ||
1935 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | ||
1936 | */ | ||
1937 | if (!mem_cgroup_stolen(memcg)) | ||
1938 | return; | ||
1939 | |||
1940 | move_lock_mem_cgroup(memcg, flags); | ||
1941 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | ||
1942 | move_unlock_mem_cgroup(memcg, flags); | ||
1943 | goto again; | ||
1944 | } | ||
1945 | *locked = true; | ||
1946 | } | ||
1947 | |||
1948 | void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | ||
1949 | { | ||
1950 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1951 | |||
1952 | /* | ||
1953 | * It's guaranteed that pc->mem_cgroup never changes while | ||
1954 | * lock is held because a routine that modifies pc->mem_cgroup | ||
1955 | * should take move_lock_mem_cgroup(). | ||
1956 | */ | ||
1957 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | ||
1958 | } | ||
1959 | |||
1888 | void mem_cgroup_update_page_stat(struct page *page, | 1960 | void mem_cgroup_update_page_stat(struct page *page, |
1889 | enum mem_cgroup_page_stat_item idx, int val) | 1961 | enum mem_cgroup_page_stat_item idx, int val) |
1890 | { | 1962 | { |
1891 | struct mem_cgroup *memcg; | 1963 | struct mem_cgroup *memcg; |
1892 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1964 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1893 | bool need_unlock = false; | ||
1894 | unsigned long uninitialized_var(flags); | 1965 | unsigned long uninitialized_var(flags); |
1895 | 1966 | ||
1896 | if (mem_cgroup_disabled()) | 1967 | if (mem_cgroup_disabled()) |
1897 | return; | 1968 | return; |
1898 | 1969 | ||
1899 | rcu_read_lock(); | ||
1900 | memcg = pc->mem_cgroup; | 1970 | memcg = pc->mem_cgroup; |
1901 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 1971 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
1902 | goto out; | 1972 | return; |
1903 | /* pc->mem_cgroup is unstable ? */ | ||
1904 | if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { | ||
1905 | /* take a lock against to access pc->mem_cgroup */ | ||
1906 | move_lock_page_cgroup(pc, &flags); | ||
1907 | need_unlock = true; | ||
1908 | memcg = pc->mem_cgroup; | ||
1909 | if (!memcg || !PageCgroupUsed(pc)) | ||
1910 | goto out; | ||
1911 | } | ||
1912 | 1973 | ||
1913 | switch (idx) { | 1974 | switch (idx) { |
1914 | case MEMCG_NR_FILE_MAPPED: | 1975 | case MEMCG_NR_FILE_MAPPED: |
1915 | if (val > 0) | ||
1916 | SetPageCgroupFileMapped(pc); | ||
1917 | else if (!page_mapped(page)) | ||
1918 | ClearPageCgroupFileMapped(pc); | ||
1919 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | 1976 | idx = MEM_CGROUP_STAT_FILE_MAPPED; |
1920 | break; | 1977 | break; |
1921 | default: | 1978 | default: |
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
1923 | } | 1980 | } |
1924 | 1981 | ||
1925 | this_cpu_add(memcg->stat->count[idx], val); | 1982 | this_cpu_add(memcg->stat->count[idx], val); |
1926 | |||
1927 | out: | ||
1928 | if (unlikely(need_unlock)) | ||
1929 | move_unlock_page_cgroup(pc, &flags); | ||
1930 | rcu_read_unlock(); | ||
1931 | return; | ||
1932 | } | 1983 | } |
1933 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | ||
1934 | 1984 | ||
1935 | /* | 1985 | /* |
1936 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1986 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | |||
2101 | per_cpu(memcg->stat->events[i], cpu) = 0; | 2151 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2102 | memcg->nocpu_base.events[i] += x; | 2152 | memcg->nocpu_base.events[i] += x; |
2103 | } | 2153 | } |
2104 | /* need to clear ON_MOVE value, works as a kind of lock. */ | ||
2105 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | ||
2106 | spin_unlock(&memcg->pcp_counter_lock); | ||
2107 | } | ||
2108 | |||
2109 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) | ||
2110 | { | ||
2111 | int idx = MEM_CGROUP_ON_MOVE; | ||
2112 | |||
2113 | spin_lock(&memcg->pcp_counter_lock); | ||
2114 | per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; | ||
2115 | spin_unlock(&memcg->pcp_counter_lock); | 2154 | spin_unlock(&memcg->pcp_counter_lock); |
2116 | } | 2155 | } |
2117 | 2156 | ||
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2123 | struct memcg_stock_pcp *stock; | 2162 | struct memcg_stock_pcp *stock; |
2124 | struct mem_cgroup *iter; | 2163 | struct mem_cgroup *iter; |
2125 | 2164 | ||
2126 | if ((action == CPU_ONLINE)) { | 2165 | if (action == CPU_ONLINE) |
2127 | for_each_mem_cgroup(iter) | ||
2128 | synchronize_mem_cgroup_on_move(iter, cpu); | ||
2129 | return NOTIFY_OK; | 2166 | return NOTIFY_OK; |
2130 | } | ||
2131 | 2167 | ||
2132 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2168 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) |
2133 | return NOTIFY_OK; | 2169 | return NOTIFY_OK; |
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2212 | if (!oom_check) | 2248 | if (!oom_check) |
2213 | return CHARGE_NOMEM; | 2249 | return CHARGE_NOMEM; |
2214 | /* check OOM */ | 2250 | /* check OOM */ |
2215 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) | 2251 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) |
2216 | return CHARGE_OOM_DIE; | 2252 | return CHARGE_OOM_DIE; |
2217 | 2253 | ||
2218 | return CHARGE_RETRY; | 2254 | return CHARGE_RETRY; |
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2446 | { | 2482 | { |
2447 | struct zone *uninitialized_var(zone); | 2483 | struct zone *uninitialized_var(zone); |
2448 | bool was_on_lru = false; | 2484 | bool was_on_lru = false; |
2485 | bool anon; | ||
2449 | 2486 | ||
2450 | lock_page_cgroup(pc); | 2487 | lock_page_cgroup(pc); |
2451 | if (unlikely(PageCgroupUsed(pc))) { | 2488 | if (unlikely(PageCgroupUsed(pc))) { |
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2481 | * See mem_cgroup_add_lru_list(), etc. | 2518 | * See mem_cgroup_add_lru_list(), etc. |
2482 | */ | 2519 | */ |
2483 | smp_wmb(); | 2520 | smp_wmb(); |
2484 | switch (ctype) { | 2521 | SetPageCgroupUsed(pc); |
2485 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
2486 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
2487 | SetPageCgroupCache(pc); | ||
2488 | SetPageCgroupUsed(pc); | ||
2489 | break; | ||
2490 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
2491 | ClearPageCgroupCache(pc); | ||
2492 | SetPageCgroupUsed(pc); | ||
2493 | break; | ||
2494 | default: | ||
2495 | break; | ||
2496 | } | ||
2497 | 2522 | ||
2498 | if (lrucare) { | 2523 | if (lrucare) { |
2499 | if (was_on_lru) { | 2524 | if (was_on_lru) { |
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2504 | spin_unlock_irq(&zone->lru_lock); | 2529 | spin_unlock_irq(&zone->lru_lock); |
2505 | } | 2530 | } |
2506 | 2531 | ||
2507 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2532 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
2533 | anon = true; | ||
2534 | else | ||
2535 | anon = false; | ||
2536 | |||
2537 | mem_cgroup_charge_statistics(memcg, anon, nr_pages); | ||
2508 | unlock_page_cgroup(pc); | 2538 | unlock_page_cgroup(pc); |
2509 | 2539 | ||
2510 | /* | 2540 | /* |
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2517 | 2547 | ||
2518 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2548 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2519 | 2549 | ||
2520 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2550 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) |
2521 | (1 << PCG_MIGRATION)) | ||
2522 | /* | 2551 | /* |
2523 | * Because tail pages are not marked as "used", set it. We're under | 2552 | * Because tail pages are not marked as "used", set it. We're under |
2524 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2553 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2569 | { | 2598 | { |
2570 | unsigned long flags; | 2599 | unsigned long flags; |
2571 | int ret; | 2600 | int ret; |
2601 | bool anon = PageAnon(page); | ||
2572 | 2602 | ||
2573 | VM_BUG_ON(from == to); | 2603 | VM_BUG_ON(from == to); |
2574 | VM_BUG_ON(PageLRU(page)); | 2604 | VM_BUG_ON(PageLRU(page)); |
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page, | |||
2588 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 2618 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) |
2589 | goto unlock; | 2619 | goto unlock; |
2590 | 2620 | ||
2591 | move_lock_page_cgroup(pc, &flags); | 2621 | move_lock_mem_cgroup(from, &flags); |
2592 | 2622 | ||
2593 | if (PageCgroupFileMapped(pc)) { | 2623 | if (!anon && page_mapped(page)) { |
2594 | /* Update mapped_file data for mem_cgroup */ | 2624 | /* Update mapped_file data for mem_cgroup */ |
2595 | preempt_disable(); | 2625 | preempt_disable(); |
2596 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2626 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2597 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2627 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2598 | preempt_enable(); | 2628 | preempt_enable(); |
2599 | } | 2629 | } |
2600 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2630 | mem_cgroup_charge_statistics(from, anon, -nr_pages); |
2601 | if (uncharge) | 2631 | if (uncharge) |
2602 | /* This is not "cancel", but cancel_charge does all we need. */ | 2632 | /* This is not "cancel", but cancel_charge does all we need. */ |
2603 | __mem_cgroup_cancel_charge(from, nr_pages); | 2633 | __mem_cgroup_cancel_charge(from, nr_pages); |
2604 | 2634 | ||
2605 | /* caller should have done css_get */ | 2635 | /* caller should have done css_get */ |
2606 | pc->mem_cgroup = to; | 2636 | pc->mem_cgroup = to; |
2607 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); | 2637 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2608 | /* | 2638 | /* |
2609 | * We charges against "to" which may not have any tasks. Then, "to" | 2639 | * We charges against "to" which may not have any tasks. Then, "to" |
2610 | * can be under rmdir(). But in current implementation, caller of | 2640 | * can be under rmdir(). But in current implementation, caller of |
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2612 | * guaranteed that "to" is never removed. So, we don't check rmdir | 2642 | * guaranteed that "to" is never removed. So, we don't check rmdir |
2613 | * status here. | 2643 | * status here. |
2614 | */ | 2644 | */ |
2615 | move_unlock_page_cgroup(pc, &flags); | 2645 | move_unlock_mem_cgroup(from, &flags); |
2616 | ret = 0; | 2646 | ret = 0; |
2617 | unlock: | 2647 | unlock: |
2618 | unlock_page_cgroup(pc); | 2648 | unlock_page_cgroup(pc); |
@@ -2914,7 +2944,6 @@ direct_uncharge: | |||
2914 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); | 2944 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); |
2915 | if (unlikely(batch->memcg != memcg)) | 2945 | if (unlikely(batch->memcg != memcg)) |
2916 | memcg_oom_recover(memcg); | 2946 | memcg_oom_recover(memcg); |
2917 | return; | ||
2918 | } | 2947 | } |
2919 | 2948 | ||
2920 | /* | 2949 | /* |
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2926 | struct mem_cgroup *memcg = NULL; | 2955 | struct mem_cgroup *memcg = NULL; |
2927 | unsigned int nr_pages = 1; | 2956 | unsigned int nr_pages = 1; |
2928 | struct page_cgroup *pc; | 2957 | struct page_cgroup *pc; |
2958 | bool anon; | ||
2929 | 2959 | ||
2930 | if (mem_cgroup_disabled()) | 2960 | if (mem_cgroup_disabled()) |
2931 | return NULL; | 2961 | return NULL; |
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2951 | if (!PageCgroupUsed(pc)) | 2981 | if (!PageCgroupUsed(pc)) |
2952 | goto unlock_out; | 2982 | goto unlock_out; |
2953 | 2983 | ||
2984 | anon = PageAnon(page); | ||
2985 | |||
2954 | switch (ctype) { | 2986 | switch (ctype) { |
2955 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2987 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2988 | /* | ||
2989 | * Generally PageAnon tells if it's the anon statistics to be | ||
2990 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | ||
2991 | * used before page reached the stage of being marked PageAnon. | ||
2992 | */ | ||
2993 | anon = true; | ||
2994 | /* fallthrough */ | ||
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2995 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 2996 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 2997 | if (page_mapped(page) || PageCgroupMigration(pc)) |
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2969 | break; | 3008 | break; |
2970 | } | 3009 | } |
2971 | 3010 | ||
2972 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); | 3011 | mem_cgroup_charge_statistics(memcg, anon, -nr_pages); |
2973 | 3012 | ||
2974 | ClearPageCgroupUsed(pc); | 3013 | ClearPageCgroupUsed(pc); |
2975 | /* | 3014 | /* |
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | { | 3315 | { |
3277 | struct page *used, *unused; | 3316 | struct page *used, *unused; |
3278 | struct page_cgroup *pc; | 3317 | struct page_cgroup *pc; |
3318 | bool anon; | ||
3279 | 3319 | ||
3280 | if (!memcg) | 3320 | if (!memcg) |
3281 | return; | 3321 | return; |
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3297 | lock_page_cgroup(pc); | 3337 | lock_page_cgroup(pc); |
3298 | ClearPageCgroupMigration(pc); | 3338 | ClearPageCgroupMigration(pc); |
3299 | unlock_page_cgroup(pc); | 3339 | unlock_page_cgroup(pc); |
3300 | 3340 | anon = PageAnon(used); | |
3301 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | 3341 | __mem_cgroup_uncharge_common(unused, |
3342 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3343 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3302 | 3344 | ||
3303 | /* | 3345 | /* |
3304 | * If a page is a file cache, radix-tree replacement is very atomic | 3346 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3308 | * and USED bit check in mem_cgroup_uncharge_page() will do enough | 3350 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
3309 | * check. (see prepare_charge() also) | 3351 | * check. (see prepare_charge() also) |
3310 | */ | 3352 | */ |
3311 | if (PageAnon(used)) | 3353 | if (anon) |
3312 | mem_cgroup_uncharge_page(used); | 3354 | mem_cgroup_uncharge_page(used); |
3313 | /* | 3355 | /* |
3314 | * At migration, we may charge account against cgroup which has no | 3356 | * At migration, we may charge account against cgroup which has no |
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3338 | /* fix accounting on old pages */ | 3380 | /* fix accounting on old pages */ |
3339 | lock_page_cgroup(pc); | 3381 | lock_page_cgroup(pc); |
3340 | memcg = pc->mem_cgroup; | 3382 | memcg = pc->mem_cgroup; |
3341 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | 3383 | mem_cgroup_charge_statistics(memcg, false, -1); |
3342 | ClearPageCgroupUsed(pc); | 3384 | ClearPageCgroupUsed(pc); |
3343 | unlock_page_cgroup(pc); | 3385 | unlock_page_cgroup(pc); |
3344 | 3386 | ||
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3549 | break; | 3591 | break; |
3550 | 3592 | ||
3551 | nr_scanned = 0; | 3593 | nr_scanned = 0; |
3552 | reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, | 3594 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, |
3553 | gfp_mask, &nr_scanned); | 3595 | gfp_mask, &nr_scanned); |
3554 | nr_reclaimed += reclaimed; | 3596 | nr_reclaimed += reclaimed; |
3555 | *total_scanned += nr_scanned; | 3597 | *total_scanned += nr_scanned; |
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3576 | next_mz = | 3618 | next_mz = |
3577 | __mem_cgroup_largest_soft_limit_node(mctz); | 3619 | __mem_cgroup_largest_soft_limit_node(mctz); |
3578 | if (next_mz == mz) | 3620 | if (next_mz == mz) |
3579 | css_put(&next_mz->mem->css); | 3621 | css_put(&next_mz->memcg->css); |
3580 | else /* next_mz == NULL or other memcg */ | 3622 | else /* next_mz == NULL or other memcg */ |
3581 | break; | 3623 | break; |
3582 | } while (1); | 3624 | } while (1); |
3583 | } | 3625 | } |
3584 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 3626 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
3585 | excess = res_counter_soft_limit_excess(&mz->mem->res); | 3627 | excess = res_counter_soft_limit_excess(&mz->memcg->res); |
3586 | /* | 3628 | /* |
3587 | * One school of thought says that we should not add | 3629 | * One school of thought says that we should not add |
3588 | * back the node to the tree if reclaim returns 0. | 3630 | * back the node to the tree if reclaim returns 0. |
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3592 | * term TODO. | 3634 | * term TODO. |
3593 | */ | 3635 | */ |
3594 | /* If excess == 0, no tree ops */ | 3636 | /* If excess == 0, no tree ops */ |
3595 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | 3637 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); |
3596 | spin_unlock(&mctz->lock); | 3638 | spin_unlock(&mctz->lock); |
3597 | css_put(&mz->mem->css); | 3639 | css_put(&mz->memcg->css); |
3598 | loop++; | 3640 | loop++; |
3599 | /* | 3641 | /* |
3600 | * Could not reclaim anything and there are no more | 3642 | * Could not reclaim anything and there are no more |
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3607 | break; | 3649 | break; |
3608 | } while (!nr_reclaimed); | 3650 | } while (!nr_reclaimed); |
3609 | if (next_mz) | 3651 | if (next_mz) |
3610 | css_put(&next_mz->mem->css); | 3652 | css_put(&next_mz->memcg->css); |
3611 | return nr_reclaimed; | 3653 | return nr_reclaimed; |
3612 | } | 3654 | } |
3613 | 3655 | ||
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3629 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3671 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3630 | list = &mz->lruvec.lists[lru]; | 3672 | list = &mz->lruvec.lists[lru]; |
3631 | 3673 | ||
3632 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3674 | loop = mz->lru_size[lru]; |
3633 | /* give some margin against EBUSY etc...*/ | 3675 | /* give some margin against EBUSY etc...*/ |
3634 | loop += 256; | 3676 | loop += 256; |
3635 | busy = NULL; | 3677 | busy = NULL; |
@@ -3703,10 +3745,10 @@ move_account: | |||
3703 | mem_cgroup_start_move(memcg); | 3745 | mem_cgroup_start_move(memcg); |
3704 | for_each_node_state(node, N_HIGH_MEMORY) { | 3746 | for_each_node_state(node, N_HIGH_MEMORY) { |
3705 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3747 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3706 | enum lru_list l; | 3748 | enum lru_list lru; |
3707 | for_each_lru(l) { | 3749 | for_each_lru(lru) { |
3708 | ret = mem_cgroup_force_empty_list(memcg, | 3750 | ret = mem_cgroup_force_empty_list(memcg, |
3709 | node, zid, l); | 3751 | node, zid, lru); |
3710 | if (ret) | 3752 | if (ret) |
3711 | break; | 3753 | break; |
3712 | } | 3754 | } |
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3860 | break; | 3902 | break; |
3861 | default: | 3903 | default: |
3862 | BUG(); | 3904 | BUG(); |
3863 | break; | ||
3864 | } | 3905 | } |
3865 | return val; | 3906 | return val; |
3866 | } | 3907 | } |
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | |||
3939 | out: | 3980 | out: |
3940 | *mem_limit = min_limit; | 3981 | *mem_limit = min_limit; |
3941 | *memsw_limit = min_memsw_limit; | 3982 | *memsw_limit = min_memsw_limit; |
3942 | return; | ||
3943 | } | 3983 | } |
3944 | 3984 | ||
3945 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3985 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4098 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 4138 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; |
4099 | unsigned long node_nr; | 4139 | unsigned long node_nr; |
4100 | struct cgroup *cont = m->private; | 4140 | struct cgroup *cont = m->private; |
4101 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4102 | 4142 | ||
4103 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); | 4143 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4104 | seq_printf(m, "total=%lu", total_nr); | 4144 | seq_printf(m, "total=%lu", total_nr); |
4105 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4145 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4106 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); | 4146 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4107 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4147 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4108 | } | 4148 | } |
4109 | seq_putc(m, '\n'); | 4149 | seq_putc(m, '\n'); |
4110 | 4150 | ||
4111 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); | 4151 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4112 | seq_printf(m, "file=%lu", file_nr); | 4152 | seq_printf(m, "file=%lu", file_nr); |
4113 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4153 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4114 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4154 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4115 | LRU_ALL_FILE); | 4155 | LRU_ALL_FILE); |
4116 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4156 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4117 | } | 4157 | } |
4118 | seq_putc(m, '\n'); | 4158 | seq_putc(m, '\n'); |
4119 | 4159 | ||
4120 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); | 4160 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4121 | seq_printf(m, "anon=%lu", anon_nr); | 4161 | seq_printf(m, "anon=%lu", anon_nr); |
4122 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4162 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4123 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4163 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4124 | LRU_ALL_ANON); | 4164 | LRU_ALL_ANON); |
4125 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4165 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4126 | } | 4166 | } |
4127 | seq_putc(m, '\n'); | 4167 | seq_putc(m, '\n'); |
4128 | 4168 | ||
4129 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); | 4169 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4130 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4170 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4131 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4171 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4132 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4172 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4133 | BIT(LRU_UNEVICTABLE)); | 4173 | BIT(LRU_UNEVICTABLE)); |
4134 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4174 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4135 | } | 4175 | } |
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4141 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4181 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
4142 | struct cgroup_map_cb *cb) | 4182 | struct cgroup_map_cb *cb) |
4143 | { | 4183 | { |
4144 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4184 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4145 | struct mcs_total_stat mystat; | 4185 | struct mcs_total_stat mystat; |
4146 | int i; | 4186 | int i; |
4147 | 4187 | ||
4148 | memset(&mystat, 0, sizeof(mystat)); | 4188 | memset(&mystat, 0, sizeof(mystat)); |
4149 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 4189 | mem_cgroup_get_local_stat(memcg, &mystat); |
4150 | 4190 | ||
4151 | 4191 | ||
4152 | for (i = 0; i < NR_MCS_STAT; i++) { | 4192 | for (i = 0; i < NR_MCS_STAT; i++) { |
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4158 | /* Hierarchical information */ | 4198 | /* Hierarchical information */ |
4159 | { | 4199 | { |
4160 | unsigned long long limit, memsw_limit; | 4200 | unsigned long long limit, memsw_limit; |
4161 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | 4201 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); |
4162 | cb->fill(cb, "hierarchical_memory_limit", limit); | 4202 | cb->fill(cb, "hierarchical_memory_limit", limit); |
4163 | if (do_swap_account) | 4203 | if (do_swap_account) |
4164 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 4204 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); |
4165 | } | 4205 | } |
4166 | 4206 | ||
4167 | memset(&mystat, 0, sizeof(mystat)); | 4207 | memset(&mystat, 0, sizeof(mystat)); |
4168 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 4208 | mem_cgroup_get_total_stat(memcg, &mystat); |
4169 | for (i = 0; i < NR_MCS_STAT; i++) { | 4209 | for (i = 0; i < NR_MCS_STAT; i++) { |
4170 | if (i == MCS_SWAP && !do_swap_account) | 4210 | if (i == MCS_SWAP && !do_swap_account) |
4171 | continue; | 4211 | continue; |
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4181 | 4221 | ||
4182 | for_each_online_node(nid) | 4222 | for_each_online_node(nid) |
4183 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4223 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4184 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 4224 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4185 | 4225 | ||
4186 | recent_rotated[0] += | 4226 | recent_rotated[0] += |
4187 | mz->reclaim_stat.recent_rotated[0]; | 4227 | mz->reclaim_stat.recent_rotated[0]; |
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4426 | else | 4466 | else |
4427 | BUG(); | 4467 | BUG(); |
4428 | 4468 | ||
4429 | /* | ||
4430 | * Something went wrong if we trying to unregister a threshold | ||
4431 | * if we don't have thresholds | ||
4432 | */ | ||
4433 | BUG_ON(!thresholds); | ||
4434 | |||
4435 | if (!thresholds->primary) | 4469 | if (!thresholds->primary) |
4436 | goto unlock; | 4470 | goto unlock; |
4437 | 4471 | ||
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4736 | { | 4770 | { |
4737 | struct mem_cgroup_per_node *pn; | 4771 | struct mem_cgroup_per_node *pn; |
4738 | struct mem_cgroup_per_zone *mz; | 4772 | struct mem_cgroup_per_zone *mz; |
4739 | enum lru_list l; | 4773 | enum lru_list lru; |
4740 | int zone, tmp = node; | 4774 | int zone, tmp = node; |
4741 | /* | 4775 | /* |
4742 | * This routine is called against possible nodes. | 4776 | * This routine is called against possible nodes. |
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4754 | 4788 | ||
4755 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4789 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4756 | mz = &pn->zoneinfo[zone]; | 4790 | mz = &pn->zoneinfo[zone]; |
4757 | for_each_lru(l) | 4791 | for_each_lru(lru) |
4758 | INIT_LIST_HEAD(&mz->lruvec.lists[l]); | 4792 | INIT_LIST_HEAD(&mz->lruvec.lists[lru]); |
4759 | mz->usage_in_excess = 0; | 4793 | mz->usage_in_excess = 0; |
4760 | mz->on_tree = false; | 4794 | mz->on_tree = false; |
4761 | mz->mem = memcg; | 4795 | mz->memcg = memcg; |
4762 | } | 4796 | } |
4763 | memcg->info.nodeinfo[node] = pn; | 4797 | memcg->info.nodeinfo[node] = pn; |
4764 | return 0; | 4798 | return 0; |
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4771 | 4805 | ||
4772 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4806 | static struct mem_cgroup *mem_cgroup_alloc(void) |
4773 | { | 4807 | { |
4774 | struct mem_cgroup *mem; | 4808 | struct mem_cgroup *memcg; |
4775 | int size = sizeof(struct mem_cgroup); | 4809 | int size = sizeof(struct mem_cgroup); |
4776 | 4810 | ||
4777 | /* Can be very big if MAX_NUMNODES is very big */ | 4811 | /* Can be very big if MAX_NUMNODES is very big */ |
4778 | if (size < PAGE_SIZE) | 4812 | if (size < PAGE_SIZE) |
4779 | mem = kzalloc(size, GFP_KERNEL); | 4813 | memcg = kzalloc(size, GFP_KERNEL); |
4780 | else | 4814 | else |
4781 | mem = vzalloc(size); | 4815 | memcg = vzalloc(size); |
4782 | 4816 | ||
4783 | if (!mem) | 4817 | if (!memcg) |
4784 | return NULL; | 4818 | return NULL; |
4785 | 4819 | ||
4786 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4820 | memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4787 | if (!mem->stat) | 4821 | if (!memcg->stat) |
4788 | goto out_free; | 4822 | goto out_free; |
4789 | spin_lock_init(&mem->pcp_counter_lock); | 4823 | spin_lock_init(&memcg->pcp_counter_lock); |
4790 | return mem; | 4824 | return memcg; |
4791 | 4825 | ||
4792 | out_free: | 4826 | out_free: |
4793 | if (size < PAGE_SIZE) | 4827 | if (size < PAGE_SIZE) |
4794 | kfree(mem); | 4828 | kfree(memcg); |
4795 | else | 4829 | else |
4796 | vfree(mem); | 4830 | vfree(memcg); |
4797 | return NULL; | 4831 | return NULL; |
4798 | } | 4832 | } |
4799 | 4833 | ||
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont) | |||
4981 | atomic_set(&memcg->refcnt, 1); | 5015 | atomic_set(&memcg->refcnt, 1); |
4982 | memcg->move_charge_at_immigrate = 0; | 5016 | memcg->move_charge_at_immigrate = 0; |
4983 | mutex_init(&memcg->thresholds_lock); | 5017 | mutex_init(&memcg->thresholds_lock); |
5018 | spin_lock_init(&memcg->move_lock); | ||
4984 | return &memcg->css; | 5019 | return &memcg->css; |
4985 | free_out: | 5020 | free_out: |
4986 | __mem_cgroup_free(memcg); | 5021 | __mem_cgroup_free(memcg); |
@@ -5075,7 +5110,7 @@ one_by_one: | |||
5075 | } | 5110 | } |
5076 | 5111 | ||
5077 | /** | 5112 | /** |
5078 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | 5113 | * get_mctgt_type - get target type of moving charge |
5079 | * @vma: the vma the pte to be checked belongs | 5114 | * @vma: the vma the pte to be checked belongs |
5080 | * @addr: the address corresponding to the pte to be checked | 5115 | * @addr: the address corresponding to the pte to be checked |
5081 | * @ptent: the pte to be checked | 5116 | * @ptent: the pte to be checked |
@@ -5098,7 +5133,7 @@ union mc_target { | |||
5098 | }; | 5133 | }; |
5099 | 5134 | ||
5100 | enum mc_target_type { | 5135 | enum mc_target_type { |
5101 | MC_TARGET_NONE, /* not used */ | 5136 | MC_TARGET_NONE = 0, |
5102 | MC_TARGET_PAGE, | 5137 | MC_TARGET_PAGE, |
5103 | MC_TARGET_SWAP, | 5138 | MC_TARGET_SWAP, |
5104 | }; | 5139 | }; |
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5179 | return page; | 5214 | return page; |
5180 | } | 5215 | } |
5181 | 5216 | ||
5182 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 5217 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
5183 | unsigned long addr, pte_t ptent, union mc_target *target) | 5218 | unsigned long addr, pte_t ptent, union mc_target *target) |
5184 | { | 5219 | { |
5185 | struct page *page = NULL; | 5220 | struct page *page = NULL; |
5186 | struct page_cgroup *pc; | 5221 | struct page_cgroup *pc; |
5187 | int ret = 0; | 5222 | enum mc_target_type ret = MC_TARGET_NONE; |
5188 | swp_entry_t ent = { .val = 0 }; | 5223 | swp_entry_t ent = { .val = 0 }; |
5189 | 5224 | ||
5190 | if (pte_present(ptent)) | 5225 | if (pte_present(ptent)) |
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5195 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 5230 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
5196 | 5231 | ||
5197 | if (!page && !ent.val) | 5232 | if (!page && !ent.val) |
5198 | return 0; | 5233 | return ret; |
5199 | if (page) { | 5234 | if (page) { |
5200 | pc = lookup_page_cgroup(page); | 5235 | pc = lookup_page_cgroup(page); |
5201 | /* | 5236 | /* |
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5221 | return ret; | 5256 | return ret; |
5222 | } | 5257 | } |
5223 | 5258 | ||
5259 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5260 | /* | ||
5261 | * We don't consider swapping or file mapped pages because THP does not | ||
5262 | * support them for now. | ||
5263 | * Caller should make sure that pmd_trans_huge(pmd) is true. | ||
5264 | */ | ||
5265 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5266 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5267 | { | ||
5268 | struct page *page = NULL; | ||
5269 | struct page_cgroup *pc; | ||
5270 | enum mc_target_type ret = MC_TARGET_NONE; | ||
5271 | |||
5272 | page = pmd_page(pmd); | ||
5273 | VM_BUG_ON(!page || !PageHead(page)); | ||
5274 | if (!move_anon()) | ||
5275 | return ret; | ||
5276 | pc = lookup_page_cgroup(page); | ||
5277 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
5278 | ret = MC_TARGET_PAGE; | ||
5279 | if (target) { | ||
5280 | get_page(page); | ||
5281 | target->page = page; | ||
5282 | } | ||
5283 | } | ||
5284 | return ret; | ||
5285 | } | ||
5286 | #else | ||
5287 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5288 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5289 | { | ||
5290 | return MC_TARGET_NONE; | ||
5291 | } | ||
5292 | #endif | ||
5293 | |||
5224 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 5294 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, |
5225 | unsigned long addr, unsigned long end, | 5295 | unsigned long addr, unsigned long end, |
5226 | struct mm_walk *walk) | 5296 | struct mm_walk *walk) |
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5229 | pte_t *pte; | 5299 | pte_t *pte; |
5230 | spinlock_t *ptl; | 5300 | spinlock_t *ptl; |
5231 | 5301 | ||
5232 | split_huge_page_pmd(walk->mm, pmd); | 5302 | if (pmd_trans_huge_lock(pmd, vma) == 1) { |
5303 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | ||
5304 | mc.precharge += HPAGE_PMD_NR; | ||
5305 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5306 | return 0; | ||
5307 | } | ||
5233 | 5308 | ||
5234 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5309 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5235 | for (; addr != end; pte++, addr += PAGE_SIZE) | 5310 | for (; addr != end; pte++, addr += PAGE_SIZE) |
5236 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 5311 | if (get_mctgt_type(vma, addr, *pte, NULL)) |
5237 | mc.precharge++; /* increment precharge temporarily */ | 5312 | mc.precharge++; /* increment precharge temporarily */ |
5238 | pte_unmap_unlock(pte - 1, ptl); | 5313 | pte_unmap_unlock(pte - 1, ptl); |
5239 | cond_resched(); | 5314 | cond_resched(); |
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5388 | struct vm_area_struct *vma = walk->private; | 5463 | struct vm_area_struct *vma = walk->private; |
5389 | pte_t *pte; | 5464 | pte_t *pte; |
5390 | spinlock_t *ptl; | 5465 | spinlock_t *ptl; |
5466 | enum mc_target_type target_type; | ||
5467 | union mc_target target; | ||
5468 | struct page *page; | ||
5469 | struct page_cgroup *pc; | ||
5470 | |||
5471 | /* | ||
5472 | * We don't take compound_lock() here but no race with splitting thp | ||
5473 | * happens because: | ||
5474 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not | ||
5475 | * under splitting, which means there's no concurrent thp split, | ||
5476 | * - if another thread runs into split_huge_page() just after we | ||
5477 | * entered this if-block, the thread must wait for page table lock | ||
5478 | * to be unlocked in __split_huge_page_splitting(), where the main | ||
5479 | * part of thp split is not executed yet. | ||
5480 | */ | ||
5481 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | ||
5482 | if (!mc.precharge) { | ||
5483 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5484 | return 0; | ||
5485 | } | ||
5486 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | ||
5487 | if (target_type == MC_TARGET_PAGE) { | ||
5488 | page = target.page; | ||
5489 | if (!isolate_lru_page(page)) { | ||
5490 | pc = lookup_page_cgroup(page); | ||
5491 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | ||
5492 | pc, mc.from, mc.to, | ||
5493 | false)) { | ||
5494 | mc.precharge -= HPAGE_PMD_NR; | ||
5495 | mc.moved_charge += HPAGE_PMD_NR; | ||
5496 | } | ||
5497 | putback_lru_page(page); | ||
5498 | } | ||
5499 | put_page(page); | ||
5500 | } | ||
5501 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5502 | return 0; | ||
5503 | } | ||
5391 | 5504 | ||
5392 | split_huge_page_pmd(walk->mm, pmd); | ||
5393 | retry: | 5505 | retry: |
5394 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5506 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5395 | for (; addr != end; addr += PAGE_SIZE) { | 5507 | for (; addr != end; addr += PAGE_SIZE) { |
5396 | pte_t ptent = *(pte++); | 5508 | pte_t ptent = *(pte++); |
5397 | union mc_target target; | ||
5398 | int type; | ||
5399 | struct page *page; | ||
5400 | struct page_cgroup *pc; | ||
5401 | swp_entry_t ent; | 5509 | swp_entry_t ent; |
5402 | 5510 | ||
5403 | if (!mc.precharge) | 5511 | if (!mc.precharge) |
5404 | break; | 5512 | break; |
5405 | 5513 | ||
5406 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | 5514 | switch (get_mctgt_type(vma, addr, ptent, &target)) { |
5407 | switch (type) { | ||
5408 | case MC_TARGET_PAGE: | 5515 | case MC_TARGET_PAGE: |
5409 | page = target.page; | 5516 | page = target.page; |
5410 | if (isolate_lru_page(page)) | 5517 | if (isolate_lru_page(page)) |
@@ -5417,7 +5524,7 @@ retry: | |||
5417 | mc.moved_charge++; | 5524 | mc.moved_charge++; |
5418 | } | 5525 | } |
5419 | putback_lru_page(page); | 5526 | putback_lru_page(page); |
5420 | put: /* is_target_pte_for_mc() gets the page */ | 5527 | put: /* get_mctgt_type() gets the page */ |
5421 | put_page(page); | 5528 | put_page(page); |
5422 | break; | 5529 | break; |
5423 | case MC_TARGET_SWAP: | 5530 | case MC_TARGET_SWAP: |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 56080ea36140..c22076ffdd44 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1063 | * The check (unnecessarily) ignores LRU pages being isolated and | 1063 | * The check (unnecessarily) ignores LRU pages being isolated and |
1064 | * walked by the page reclaim code, however that's not a big loss. | 1064 | * walked by the page reclaim code, however that's not a big loss. |
1065 | */ | 1065 | */ |
1066 | if (!PageHuge(p) && !PageTransCompound(p)) { | 1066 | if (!PageHuge(p) && !PageTransTail(p)) { |
1067 | if (!PageLRU(p)) | 1067 | if (!PageLRU(p)) |
1068 | shake_page(p, 0); | 1068 | shake_page(p, 0); |
1069 | if (!PageLRU(p)) { | 1069 | if (!PageLRU(p)) { |
diff --git a/mm/memory.c b/mm/memory.c index 8438c157e4d9..3416b6e018d6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); | |||
125 | 125 | ||
126 | #if defined(SPLIT_RSS_COUNTING) | 126 | #if defined(SPLIT_RSS_COUNTING) |
127 | 127 | ||
128 | static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | 128 | void sync_mm_rss(struct mm_struct *mm) |
129 | { | 129 | { |
130 | int i; | 130 | int i; |
131 | 131 | ||
132 | for (i = 0; i < NR_MM_COUNTERS; i++) { | 132 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
133 | if (task->rss_stat.count[i]) { | 133 | if (current->rss_stat.count[i]) { |
134 | add_mm_counter(mm, i, task->rss_stat.count[i]); | 134 | add_mm_counter(mm, i, current->rss_stat.count[i]); |
135 | task->rss_stat.count[i] = 0; | 135 | current->rss_stat.count[i] = 0; |
136 | } | 136 | } |
137 | } | 137 | } |
138 | task->rss_stat.events = 0; | 138 | current->rss_stat.events = 0; |
139 | } | 139 | } |
140 | 140 | ||
141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | 141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) |
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
157 | if (unlikely(task != current)) | 157 | if (unlikely(task != current)) |
158 | return; | 158 | return; |
159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | 159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) |
160 | __sync_task_rss_stat(task, task->mm); | 160 | sync_mm_rss(task->mm); |
161 | } | ||
162 | |||
163 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
164 | { | ||
165 | long val = 0; | ||
166 | |||
167 | /* | ||
168 | * Don't use task->mm here...for avoiding to use task_get_mm().. | ||
169 | * The caller must guarantee task->mm is not invalid. | ||
170 | */ | ||
171 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
172 | /* | ||
173 | * counter is updated in asynchronous manner and may go to minus. | ||
174 | * But it's never be expected number for users. | ||
175 | */ | ||
176 | if (val < 0) | ||
177 | return 0; | ||
178 | return (unsigned long)val; | ||
179 | } | ||
180 | |||
181 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
182 | { | ||
183 | __sync_task_rss_stat(task, mm); | ||
184 | } | 161 | } |
185 | #else /* SPLIT_RSS_COUNTING */ | 162 | #else /* SPLIT_RSS_COUNTING */ |
186 | 163 | ||
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) | |||
661 | int i; | 638 | int i; |
662 | 639 | ||
663 | if (current->mm == mm) | 640 | if (current->mm == mm) |
664 | sync_mm_rss(current, mm); | 641 | sync_mm_rss(mm); |
665 | for (i = 0; i < NR_MM_COUNTERS; i++) | 642 | for (i = 0; i < NR_MM_COUNTERS; i++) |
666 | if (rss[i]) | 643 | if (rss[i]) |
667 | add_mm_counter(mm, i, rss[i]); | 644 | add_mm_counter(mm, i, rss[i]); |
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1247 | do { | 1224 | do { |
1248 | next = pmd_addr_end(addr, end); | 1225 | next = pmd_addr_end(addr, end); |
1249 | if (pmd_trans_huge(*pmd)) { | 1226 | if (pmd_trans_huge(*pmd)) { |
1250 | if (next-addr != HPAGE_PMD_SIZE) { | 1227 | if (next - addr != HPAGE_PMD_SIZE) { |
1251 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1252 | split_huge_page_pmd(vma->vm_mm, pmd); | 1229 | split_huge_page_pmd(vma->vm_mm, pmd); |
1253 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1254 | continue; | 1231 | goto next; |
1255 | /* fall through */ | 1232 | /* fall through */ |
1256 | } | 1233 | } |
1257 | if (pmd_none_or_clear_bad(pmd)) | 1234 | /* |
1258 | continue; | 1235 | * Here there can be other concurrent MADV_DONTNEED or |
1236 | * trans huge page faults running, and if the pmd is | ||
1237 | * none or trans huge it can change under us. This is | ||
1238 | * because MADV_DONTNEED holds the mmap_sem in read | ||
1239 | * mode. | ||
1240 | */ | ||
1241 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
1242 | goto next; | ||
1259 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); | 1243 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1244 | next: | ||
1260 | cond_resched(); | 1245 | cond_resched(); |
1261 | } while (pmd++, addr = next, addr != end); | 1246 | } while (pmd++, addr = next, addr != end); |
1262 | 1247 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 47296fee23db..cfb6c8678754 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
512 | do { | 512 | do { |
513 | next = pmd_addr_end(addr, end); | 513 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 514 | split_huge_page_pmd(vma->vm_mm, pmd); |
515 | if (pmd_none_or_clear_bad(pmd)) | 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 516 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 517 | if (check_pte_range(vma, pmd, addr, next, nodes, |
518 | flags, private)) | 518 | flags, private)) |
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1323 | err = -ESRCH; | 1323 | err = -ESRCH; |
1324 | goto out; | 1324 | goto out; |
1325 | } | 1325 | } |
1326 | mm = get_task_mm(task); | 1326 | get_task_struct(task); |
1327 | rcu_read_unlock(); | ||
1328 | 1327 | ||
1329 | err = -EINVAL; | 1328 | err = -EINVAL; |
1330 | if (!mm) | ||
1331 | goto out; | ||
1332 | 1329 | ||
1333 | /* | 1330 | /* |
1334 | * Check if this process has the right to modify the specified | 1331 | * Check if this process has the right to modify the specified |
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1336 | * capabilities, superuser privileges or the same | 1333 | * capabilities, superuser privileges or the same |
1337 | * userid as the target process. | 1334 | * userid as the target process. |
1338 | */ | 1335 | */ |
1339 | rcu_read_lock(); | ||
1340 | tcred = __task_cred(task); | 1336 | tcred = __task_cred(task); |
1341 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1337 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1342 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1338 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
1343 | !capable(CAP_SYS_NICE)) { | 1339 | !capable(CAP_SYS_NICE)) { |
1344 | rcu_read_unlock(); | 1340 | rcu_read_unlock(); |
1345 | err = -EPERM; | 1341 | err = -EPERM; |
1346 | goto out; | 1342 | goto out_put; |
1347 | } | 1343 | } |
1348 | rcu_read_unlock(); | 1344 | rcu_read_unlock(); |
1349 | 1345 | ||
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1351 | /* Is the user allowed to access the target nodes? */ | 1347 | /* Is the user allowed to access the target nodes? */ |
1352 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { | 1348 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { |
1353 | err = -EPERM; | 1349 | err = -EPERM; |
1354 | goto out; | 1350 | goto out_put; |
1355 | } | 1351 | } |
1356 | 1352 | ||
1357 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1353 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { |
1358 | err = -EINVAL; | 1354 | err = -EINVAL; |
1359 | goto out; | 1355 | goto out_put; |
1360 | } | 1356 | } |
1361 | 1357 | ||
1362 | err = security_task_movememory(task); | 1358 | err = security_task_movememory(task); |
1363 | if (err) | 1359 | if (err) |
1364 | goto out; | 1360 | goto out_put; |
1365 | 1361 | ||
1366 | err = do_migrate_pages(mm, old, new, | 1362 | mm = get_task_mm(task); |
1367 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 1363 | put_task_struct(task); |
1368 | out: | ||
1369 | if (mm) | 1364 | if (mm) |
1370 | mmput(mm); | 1365 | err = do_migrate_pages(mm, old, new, |
1366 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
1367 | else | ||
1368 | err = -EINVAL; | ||
1369 | |||
1370 | mmput(mm); | ||
1371 | out: | ||
1371 | NODEMASK_SCRATCH_FREE(scratch); | 1372 | NODEMASK_SCRATCH_FREE(scratch); |
1372 | 1373 | ||
1373 | return err; | 1374 | return err; |
1375 | |||
1376 | out_put: | ||
1377 | put_task_struct(task); | ||
1378 | goto out; | ||
1379 | |||
1374 | } | 1380 | } |
1375 | 1381 | ||
1376 | 1382 | ||
@@ -1844,18 +1850,24 @@ struct page * | |||
1844 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1850 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1845 | unsigned long addr, int node) | 1851 | unsigned long addr, int node) |
1846 | { | 1852 | { |
1847 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1853 | struct mempolicy *pol; |
1848 | struct zonelist *zl; | 1854 | struct zonelist *zl; |
1849 | struct page *page; | 1855 | struct page *page; |
1856 | unsigned int cpuset_mems_cookie; | ||
1857 | |||
1858 | retry_cpuset: | ||
1859 | pol = get_vma_policy(current, vma, addr); | ||
1860 | cpuset_mems_cookie = get_mems_allowed(); | ||
1850 | 1861 | ||
1851 | get_mems_allowed(); | ||
1852 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1862 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1853 | unsigned nid; | 1863 | unsigned nid; |
1854 | 1864 | ||
1855 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1865 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
1856 | mpol_cond_put(pol); | 1866 | mpol_cond_put(pol); |
1857 | page = alloc_page_interleave(gfp, order, nid); | 1867 | page = alloc_page_interleave(gfp, order, nid); |
1858 | put_mems_allowed(); | 1868 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1869 | goto retry_cpuset; | ||
1870 | |||
1859 | return page; | 1871 | return page; |
1860 | } | 1872 | } |
1861 | zl = policy_zonelist(gfp, pol, node); | 1873 | zl = policy_zonelist(gfp, pol, node); |
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1866 | struct page *page = __alloc_pages_nodemask(gfp, order, | 1878 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1867 | zl, policy_nodemask(gfp, pol)); | 1879 | zl, policy_nodemask(gfp, pol)); |
1868 | __mpol_put(pol); | 1880 | __mpol_put(pol); |
1869 | put_mems_allowed(); | 1881 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1882 | goto retry_cpuset; | ||
1870 | return page; | 1883 | return page; |
1871 | } | 1884 | } |
1872 | /* | 1885 | /* |
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1874 | */ | 1887 | */ |
1875 | page = __alloc_pages_nodemask(gfp, order, zl, | 1888 | page = __alloc_pages_nodemask(gfp, order, zl, |
1876 | policy_nodemask(gfp, pol)); | 1889 | policy_nodemask(gfp, pol)); |
1877 | put_mems_allowed(); | 1890 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1891 | goto retry_cpuset; | ||
1878 | return page; | 1892 | return page; |
1879 | } | 1893 | } |
1880 | 1894 | ||
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1901 | { | 1915 | { |
1902 | struct mempolicy *pol = current->mempolicy; | 1916 | struct mempolicy *pol = current->mempolicy; |
1903 | struct page *page; | 1917 | struct page *page; |
1918 | unsigned int cpuset_mems_cookie; | ||
1904 | 1919 | ||
1905 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1920 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1906 | pol = &default_policy; | 1921 | pol = &default_policy; |
1907 | 1922 | ||
1908 | get_mems_allowed(); | 1923 | retry_cpuset: |
1924 | cpuset_mems_cookie = get_mems_allowed(); | ||
1925 | |||
1909 | /* | 1926 | /* |
1910 | * No reference counting needed for current->mempolicy | 1927 | * No reference counting needed for current->mempolicy |
1911 | * nor system default_policy | 1928 | * nor system default_policy |
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1916 | page = __alloc_pages_nodemask(gfp, order, | 1933 | page = __alloc_pages_nodemask(gfp, order, |
1917 | policy_zonelist(gfp, pol, numa_node_id()), | 1934 | policy_zonelist(gfp, pol, numa_node_id()), |
1918 | policy_nodemask(gfp, pol)); | 1935 | policy_nodemask(gfp, pol)); |
1919 | put_mems_allowed(); | 1936 | |
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | |||
1920 | return page; | 1940 | return page; |
1921 | } | 1941 | } |
1922 | EXPORT_SYMBOL(alloc_pages_current); | 1942 | EXPORT_SYMBOL(alloc_pages_current); |
diff --git a/mm/migrate.c b/mm/migrate.c index 1503b6b54ecb..51c08a0c6f68 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -1174,20 +1174,17 @@ set_status: | |||
1174 | * Migrate an array of page address onto an array of nodes and fill | 1174 | * Migrate an array of page address onto an array of nodes and fill |
1175 | * the corresponding array of status. | 1175 | * the corresponding array of status. |
1176 | */ | 1176 | */ |
1177 | static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | 1177 | static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, |
1178 | unsigned long nr_pages, | 1178 | unsigned long nr_pages, |
1179 | const void __user * __user *pages, | 1179 | const void __user * __user *pages, |
1180 | const int __user *nodes, | 1180 | const int __user *nodes, |
1181 | int __user *status, int flags) | 1181 | int __user *status, int flags) |
1182 | { | 1182 | { |
1183 | struct page_to_node *pm; | 1183 | struct page_to_node *pm; |
1184 | nodemask_t task_nodes; | ||
1185 | unsigned long chunk_nr_pages; | 1184 | unsigned long chunk_nr_pages; |
1186 | unsigned long chunk_start; | 1185 | unsigned long chunk_start; |
1187 | int err; | 1186 | int err; |
1188 | 1187 | ||
1189 | task_nodes = cpuset_mems_allowed(task); | ||
1190 | |||
1191 | err = -ENOMEM; | 1188 | err = -ENOMEM; |
1192 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 1189 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
1193 | if (!pm) | 1190 | if (!pm) |
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1349 | struct task_struct *task; | 1346 | struct task_struct *task; |
1350 | struct mm_struct *mm; | 1347 | struct mm_struct *mm; |
1351 | int err; | 1348 | int err; |
1349 | nodemask_t task_nodes; | ||
1352 | 1350 | ||
1353 | /* Check flags */ | 1351 | /* Check flags */ |
1354 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | 1352 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) |
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1364 | rcu_read_unlock(); | 1362 | rcu_read_unlock(); |
1365 | return -ESRCH; | 1363 | return -ESRCH; |
1366 | } | 1364 | } |
1367 | mm = get_task_mm(task); | 1365 | get_task_struct(task); |
1368 | rcu_read_unlock(); | ||
1369 | |||
1370 | if (!mm) | ||
1371 | return -EINVAL; | ||
1372 | 1366 | ||
1373 | /* | 1367 | /* |
1374 | * Check if this process has the right to modify the specified | 1368 | * Check if this process has the right to modify the specified |
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1376 | * capabilities, superuser privileges or the same | 1370 | * capabilities, superuser privileges or the same |
1377 | * userid as the target process. | 1371 | * userid as the target process. |
1378 | */ | 1372 | */ |
1379 | rcu_read_lock(); | ||
1380 | tcred = __task_cred(task); | 1373 | tcred = __task_cred(task); |
1381 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1374 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1382 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1375 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1391 | if (err) | 1384 | if (err) |
1392 | goto out; | 1385 | goto out; |
1393 | 1386 | ||
1394 | if (nodes) { | 1387 | task_nodes = cpuset_mems_allowed(task); |
1395 | err = do_pages_move(mm, task, nr_pages, pages, nodes, status, | 1388 | mm = get_task_mm(task); |
1396 | flags); | 1389 | put_task_struct(task); |
1397 | } else { | 1390 | |
1398 | err = do_pages_stat(mm, nr_pages, pages, status); | 1391 | if (mm) { |
1399 | } | 1392 | if (nodes) |
1393 | err = do_pages_move(mm, task_nodes, nr_pages, pages, | ||
1394 | nodes, status, flags); | ||
1395 | else | ||
1396 | err = do_pages_stat(mm, nr_pages, pages, status); | ||
1397 | } else | ||
1398 | err = -EINVAL; | ||
1400 | 1399 | ||
1401 | out: | ||
1402 | mmput(mm); | 1400 | mmput(mm); |
1403 | return err; | 1401 | return err; |
1402 | |||
1403 | out: | ||
1404 | put_task_struct(task); | ||
1405 | return err; | ||
1404 | } | 1406 | } |
1405 | 1407 | ||
1406 | /* | 1408 | /* |
diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff2..936b4cee8cb1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
164 | } | 164 | } |
165 | /* fall through */ | 165 | /* fall through */ |
166 | } | 166 | } |
167 | if (pmd_none_or_clear_bad(pmd)) | 167 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
168 | mincore_unmapped_range(vma, addr, next, vec); | 168 | mincore_unmapped_range(vma, addr, next, vec); |
169 | else | 169 | else |
170 | mincore_pte_range(vma, pmd, addr, next, vec); | 170 | mincore_pte_range(vma, pmd, addr, next, vec); |
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
451 | } | 451 | } |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * Helper for vma_adjust in the split_vma insert case: | 454 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
455 | * insert vm structure into list and rbtree and anon_vma, | 455 | * mm's list and rbtree. It has already been inserted into the prio_tree. |
456 | * but it has already been inserted into prio_tree earlier. | ||
457 | */ | 456 | */ |
458 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 457 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
459 | { | 458 | { |
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1112 | * A dummy user value is used because we are not locking | 1111 | * A dummy user value is used because we are not locking |
1113 | * memory so no accounting is necessary | 1112 | * memory so no accounting is necessary |
1114 | */ | 1113 | */ |
1115 | len = ALIGN(len, huge_page_size(&default_hstate)); | 1114 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1116 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | 1115 | VM_NORESERVE, &user, |
1117 | &user, HUGETLB_ANONHUGE_INODE); | 1116 | HUGETLB_ANONHUGE_INODE); |
1118 | if (IS_ERR(file)) | 1117 | if (IS_ERR(file)) |
1119 | return PTR_ERR(file); | 1118 | return PTR_ERR(file); |
1120 | } | 1119 | } |
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
1439 | /* | 1438 | /* |
1440 | * Is this a new hole at the lowest possible address? | 1439 | * Is this a new hole at the lowest possible address? |
1441 | */ | 1440 | */ |
1442 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { | 1441 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) |
1443 | mm->free_area_cache = addr; | 1442 | mm->free_area_cache = addr; |
1444 | mm->cached_hole_size = ~0UL; | ||
1445 | } | ||
1446 | } | 1443 | } |
1447 | 1444 | ||
1448 | /* | 1445 | /* |
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1457 | { | 1454 | { |
1458 | struct vm_area_struct *vma; | 1455 | struct vm_area_struct *vma; |
1459 | struct mm_struct *mm = current->mm; | 1456 | struct mm_struct *mm = current->mm; |
1460 | unsigned long addr = addr0; | 1457 | unsigned long addr = addr0, start_addr; |
1461 | 1458 | ||
1462 | /* requested length too big for entire address space */ | 1459 | /* requested length too big for entire address space */ |
1463 | if (len > TASK_SIZE) | 1460 | if (len > TASK_SIZE) |
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1481 | mm->free_area_cache = mm->mmap_base; | 1478 | mm->free_area_cache = mm->mmap_base; |
1482 | } | 1479 | } |
1483 | 1480 | ||
1481 | try_again: | ||
1484 | /* either no address requested or can't fit in requested address hole */ | 1482 | /* either no address requested or can't fit in requested address hole */ |
1485 | addr = mm->free_area_cache; | 1483 | start_addr = addr = mm->free_area_cache; |
1486 | |||
1487 | /* make sure it can fit in the remaining address space */ | ||
1488 | if (addr > len) { | ||
1489 | vma = find_vma(mm, addr-len); | ||
1490 | if (!vma || addr <= vma->vm_start) | ||
1491 | /* remember the address as a hint for next time */ | ||
1492 | return (mm->free_area_cache = addr-len); | ||
1493 | } | ||
1494 | |||
1495 | if (mm->mmap_base < len) | ||
1496 | goto bottomup; | ||
1497 | 1484 | ||
1498 | addr = mm->mmap_base-len; | 1485 | if (addr < len) |
1486 | goto fail; | ||
1499 | 1487 | ||
1488 | addr -= len; | ||
1500 | do { | 1489 | do { |
1501 | /* | 1490 | /* |
1502 | * Lookup failure means no vma is above this address, | 1491 | * Lookup failure means no vma is above this address, |
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1516 | addr = vma->vm_start-len; | 1505 | addr = vma->vm_start-len; |
1517 | } while (len < vma->vm_start); | 1506 | } while (len < vma->vm_start); |
1518 | 1507 | ||
1519 | bottomup: | 1508 | fail: |
1509 | /* | ||
1510 | * if hint left us with no space for the requested | ||
1511 | * mapping then try again: | ||
1512 | * | ||
1513 | * Note: this is different with the case of bottomup | ||
1514 | * which does the fully line-search, but we use find_vma | ||
1515 | * here that causes some holes skipped. | ||
1516 | */ | ||
1517 | if (start_addr != mm->mmap_base) { | ||
1518 | mm->free_area_cache = mm->mmap_base; | ||
1519 | mm->cached_hole_size = 0; | ||
1520 | goto try_again; | ||
1521 | } | ||
1522 | |||
1520 | /* | 1523 | /* |
1521 | * A failed mmap() very likely causes application failure, | 1524 | * A failed mmap() very likely causes application failure, |
1522 | * so fall back to the bottom-up function here. This scenario | 1525 | * so fall back to the bottom-up function here. This scenario |
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..3dcfaf4ed355 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) | |||
53 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
54 | 54 | ||
55 | task_lock(tsk); | 55 | task_lock(tsk); |
56 | sync_mm_rss(tsk, mm); | 56 | sync_mm_rss(mm); |
57 | tsk->mm = NULL; | 57 | tsk->mm = NULL; |
58 | /* active_mm is still 'mm' */ | 58 | /* active_mm is still 'mm' */ |
59 | enter_lazy_tlb(mm, tsk); | 59 | enter_lazy_tlb(mm, tsk); |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 142ef4a1f480..a40992610ab6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
60 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
61 | 61 | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { | 63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2958fd8e7c9a..4198e000f41a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | 36 | #include <linux/ftrace.h> |
37 | #include <linux/ratelimit.h> | ||
37 | 38 | ||
38 | #define CREATE_TRACE_POINTS | 39 | #define CREATE_TRACE_POINTS |
39 | #include <trace/events/oom.h> | 40 | #include <trace/events/oom.h> |
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
309 | */ | 310 | */ |
310 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 311 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
311 | unsigned long totalpages, struct mem_cgroup *memcg, | 312 | unsigned long totalpages, struct mem_cgroup *memcg, |
312 | const nodemask_t *nodemask) | 313 | const nodemask_t *nodemask, bool force_kill) |
313 | { | 314 | { |
314 | struct task_struct *g, *p; | 315 | struct task_struct *g, *p; |
315 | struct task_struct *chosen = NULL; | 316 | struct task_struct *chosen = NULL; |
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
335 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | 336 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { |
336 | if (unlikely(frozen(p))) | 337 | if (unlikely(frozen(p))) |
337 | __thaw_task(p); | 338 | __thaw_task(p); |
338 | return ERR_PTR(-1UL); | 339 | if (!force_kill) |
340 | return ERR_PTR(-1UL); | ||
339 | } | 341 | } |
340 | if (!p->mm) | 342 | if (!p->mm) |
341 | continue; | 343 | continue; |
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
353 | if (p == current) { | 355 | if (p == current) { |
354 | chosen = p; | 356 | chosen = p; |
355 | *ppoints = 1000; | 357 | *ppoints = 1000; |
356 | } else { | 358 | } else if (!force_kill) { |
357 | /* | 359 | /* |
358 | * If this task is not being ptraced on exit, | 360 | * If this task is not being ptraced on exit, |
359 | * then wait for it to finish before killing | 361 | * then wait for it to finish before killing |
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
434 | } | 436 | } |
435 | 437 | ||
436 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 438 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
437 | static int oom_kill_task(struct task_struct *p) | 439 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
438 | { | 440 | unsigned int points, unsigned long totalpages, |
439 | struct task_struct *q; | 441 | struct mem_cgroup *memcg, nodemask_t *nodemask, |
440 | struct mm_struct *mm; | 442 | const char *message) |
441 | |||
442 | p = find_lock_task_mm(p); | ||
443 | if (!p) | ||
444 | return 1; | ||
445 | |||
446 | /* mm cannot be safely dereferenced after task_unlock(p) */ | ||
447 | mm = p->mm; | ||
448 | |||
449 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
450 | task_pid_nr(p), p->comm, K(p->mm->total_vm), | ||
451 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | ||
452 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | ||
453 | task_unlock(p); | ||
454 | |||
455 | /* | ||
456 | * Kill all user processes sharing p->mm in other thread groups, if any. | ||
457 | * They don't get access to memory reserves or a higher scheduler | ||
458 | * priority, though, to avoid depletion of all memory or task | ||
459 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | ||
460 | * task cannot exit because it requires the semaphore and its contended | ||
461 | * by another thread trying to allocate memory itself. That thread will | ||
462 | * now get access to memory reserves since it has a pending fatal | ||
463 | * signal. | ||
464 | */ | ||
465 | for_each_process(q) | ||
466 | if (q->mm == mm && !same_thread_group(q, p) && | ||
467 | !(q->flags & PF_KTHREAD)) { | ||
468 | if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
469 | continue; | ||
470 | |||
471 | task_lock(q); /* Protect ->comm from prctl() */ | ||
472 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
473 | task_pid_nr(q), q->comm); | ||
474 | task_unlock(q); | ||
475 | force_sig(SIGKILL, q); | ||
476 | } | ||
477 | |||
478 | set_tsk_thread_flag(p, TIF_MEMDIE); | ||
479 | force_sig(SIGKILL, p); | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | #undef K | ||
484 | |||
485 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
486 | unsigned int points, unsigned long totalpages, | ||
487 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
488 | const char *message) | ||
489 | { | 443 | { |
490 | struct task_struct *victim = p; | 444 | struct task_struct *victim = p; |
491 | struct task_struct *child; | 445 | struct task_struct *child; |
492 | struct task_struct *t = p; | 446 | struct task_struct *t = p; |
447 | struct mm_struct *mm; | ||
493 | unsigned int victim_points = 0; | 448 | unsigned int victim_points = 0; |
494 | 449 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | |
495 | if (printk_ratelimit()) | 450 | DEFAULT_RATELIMIT_BURST); |
496 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
497 | 451 | ||
498 | /* | 452 | /* |
499 | * If the task is already exiting, don't alarm the sysadmin or kill | 453 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
501 | */ | 455 | */ |
502 | if (p->flags & PF_EXITING) { | 456 | if (p->flags & PF_EXITING) { |
503 | set_tsk_thread_flag(p, TIF_MEMDIE); | 457 | set_tsk_thread_flag(p, TIF_MEMDIE); |
504 | return 0; | 458 | return; |
505 | } | 459 | } |
506 | 460 | ||
461 | if (__ratelimit(&oom_rs)) | ||
462 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
463 | |||
507 | task_lock(p); | 464 | task_lock(p); |
508 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", | 465 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", |
509 | message, task_pid_nr(p), p->comm, points); | 466 | message, task_pid_nr(p), p->comm, points); |
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
533 | } | 490 | } |
534 | } while_each_thread(p, t); | 491 | } while_each_thread(p, t); |
535 | 492 | ||
536 | return oom_kill_task(victim); | 493 | victim = find_lock_task_mm(victim); |
494 | if (!victim) | ||
495 | return; | ||
496 | |||
497 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | ||
498 | mm = victim->mm; | ||
499 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
500 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | ||
501 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | ||
502 | K(get_mm_counter(victim->mm, MM_FILEPAGES))); | ||
503 | task_unlock(victim); | ||
504 | |||
505 | /* | ||
506 | * Kill all user processes sharing victim->mm in other thread groups, if | ||
507 | * any. They don't get access to memory reserves, though, to avoid | ||
508 | * depletion of all memory. This prevents mm->mmap_sem livelock when an | ||
509 | * oom killed thread cannot exit because it requires the semaphore and | ||
510 | * its contended by another thread trying to allocate memory itself. | ||
511 | * That thread will now get access to memory reserves since it has a | ||
512 | * pending fatal signal. | ||
513 | */ | ||
514 | for_each_process(p) | ||
515 | if (p->mm == mm && !same_thread_group(p, victim) && | ||
516 | !(p->flags & PF_KTHREAD)) { | ||
517 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
518 | continue; | ||
519 | |||
520 | task_lock(p); /* Protect ->comm from prctl() */ | ||
521 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
522 | task_pid_nr(p), p->comm); | ||
523 | task_unlock(p); | ||
524 | force_sig(SIGKILL, p); | ||
525 | } | ||
526 | |||
527 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
528 | force_sig(SIGKILL, victim); | ||
537 | } | 529 | } |
530 | #undef K | ||
538 | 531 | ||
539 | /* | 532 | /* |
540 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 533 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
561 | } | 554 | } |
562 | 555 | ||
563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 556 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
564 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | 557 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
558 | int order) | ||
565 | { | 559 | { |
566 | unsigned long limit; | 560 | unsigned long limit; |
567 | unsigned int points = 0; | 561 | unsigned int points = 0; |
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | |||
577 | return; | 571 | return; |
578 | } | 572 | } |
579 | 573 | ||
580 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
581 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; |
582 | read_lock(&tasklist_lock); | 576 | read_lock(&tasklist_lock); |
583 | retry: | 577 | p = select_bad_process(&points, limit, memcg, NULL, false); |
584 | p = select_bad_process(&points, limit, memcg, NULL); | 578 | if (p && PTR_ERR(p) != -1UL) |
585 | if (!p || PTR_ERR(p) == -1UL) | 579 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, |
586 | goto out; | 580 | "Memory cgroup out of memory"); |
587 | |||
588 | if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, | ||
589 | "Memory cgroup out of memory")) | ||
590 | goto retry; | ||
591 | out: | ||
592 | read_unlock(&tasklist_lock); | 581 | read_unlock(&tasklist_lock); |
593 | } | 582 | } |
594 | #endif | 583 | #endif |
@@ -700,6 +689,7 @@ static void clear_system_oom(void) | |||
700 | * @gfp_mask: memory allocation flags | 689 | * @gfp_mask: memory allocation flags |
701 | * @order: amount of memory being requested as a power of 2 | 690 | * @order: amount of memory being requested as a power of 2 |
702 | * @nodemask: nodemask passed to page allocator | 691 | * @nodemask: nodemask passed to page allocator |
692 | * @force_kill: true if a task must be killed, even if others are exiting | ||
703 | * | 693 | * |
704 | * If we run out of memory, we have the choice between either | 694 | * If we run out of memory, we have the choice between either |
705 | * killing a random task (bad), letting the system crash (worse) | 695 | * killing a random task (bad), letting the system crash (worse) |
@@ -707,7 +697,7 @@ static void clear_system_oom(void) | |||
707 | * don't have to be perfect here, we just have to be good. | 697 | * don't have to be perfect here, we just have to be good. |
708 | */ | 698 | */ |
709 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 699 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
710 | int order, nodemask_t *nodemask) | 700 | int order, nodemask_t *nodemask, bool force_kill) |
711 | { | 701 | { |
712 | const nodemask_t *mpol_mask; | 702 | const nodemask_t *mpol_mask; |
713 | struct task_struct *p; | 703 | struct task_struct *p; |
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
745 | if (sysctl_oom_kill_allocating_task && | 735 | if (sysctl_oom_kill_allocating_task && |
746 | !oom_unkillable_task(current, NULL, nodemask) && | 736 | !oom_unkillable_task(current, NULL, nodemask) && |
747 | current->mm) { | 737 | current->mm) { |
748 | /* | 738 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
749 | * oom_kill_process() needs tasklist_lock held. If it returns | 739 | nodemask, |
750 | * non-zero, current could not be killed so we must fallback to | 740 | "Out of memory (oom_kill_allocating_task)"); |
751 | * the tasklist scan. | ||
752 | */ | ||
753 | if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, | ||
754 | NULL, nodemask, | ||
755 | "Out of memory (oom_kill_allocating_task)")) | ||
756 | goto out; | ||
757 | } | ||
758 | |||
759 | retry: | ||
760 | p = select_bad_process(&points, totalpages, NULL, mpol_mask); | ||
761 | if (PTR_ERR(p) == -1UL) | ||
762 | goto out; | 741 | goto out; |
742 | } | ||
763 | 743 | ||
744 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | ||
745 | force_kill); | ||
764 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 746 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
765 | if (!p) { | 747 | if (!p) { |
766 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 748 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
767 | read_unlock(&tasklist_lock); | 749 | read_unlock(&tasklist_lock); |
768 | panic("Out of memory and no killable processes...\n"); | 750 | panic("Out of memory and no killable processes...\n"); |
769 | } | 751 | } |
770 | 752 | if (PTR_ERR(p) != -1UL) { | |
771 | if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, | 753 | oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, |
772 | nodemask, "Out of memory")) | 754 | nodemask, "Out of memory"); |
773 | goto retry; | 755 | killed = 1; |
774 | killed = 1; | 756 | } |
775 | out: | 757 | out: |
776 | read_unlock(&tasklist_lock); | 758 | read_unlock(&tasklist_lock); |
777 | 759 | ||
@@ -792,7 +774,7 @@ out: | |||
792 | void pagefault_out_of_memory(void) | 774 | void pagefault_out_of_memory(void) |
793 | { | 775 | { |
794 | if (try_set_system_oom()) { | 776 | if (try_set_system_oom()) { |
795 | out_of_memory(NULL, 0, 0, NULL); | 777 | out_of_memory(NULL, 0, 0, NULL, false); |
796 | clear_system_oom(); | 778 | clear_system_oom(); |
797 | } | 779 | } |
798 | if (!test_thread_flag(TIF_MEMDIE)) | 780 | if (!test_thread_flag(TIF_MEMDIE)) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 363ba7082ef5..3fc261705b1e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
1472 | 1472 | ||
1473 | for ( ; ; ) { | 1473 | for ( ; ; ) { |
1474 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1474 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1475 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
1475 | 1476 | ||
1476 | /* | 1477 | /* |
1477 | * Boost the allowable dirty threshold a bit for page | 1478 | * Boost the allowable dirty threshold a bit for page |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a13ded1938f0..caea788628e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1968 | goto out; | 1968 | goto out; |
1969 | } | 1969 | } |
1970 | /* Exhausted what can be done so it's blamo time */ | 1970 | /* Exhausted what can be done so it's blamo time */ |
1971 | out_of_memory(zonelist, gfp_mask, order, nodemask); | 1971 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
1972 | 1972 | ||
1973 | out: | 1973 | out: |
1974 | clear_zonelist_oom(zonelist, gfp_mask); | 1974 | clear_zonelist_oom(zonelist, gfp_mask); |
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
1990 | if (!order) | 1990 | if (!order) |
1991 | return NULL; | 1991 | return NULL; |
1992 | 1992 | ||
1993 | if (compaction_deferred(preferred_zone)) { | 1993 | if (compaction_deferred(preferred_zone, order)) { |
1994 | *deferred_compaction = true; | 1994 | *deferred_compaction = true; |
1995 | return NULL; | 1995 | return NULL; |
1996 | } | 1996 | } |
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2012 | if (page) { | 2012 | if (page) { |
2013 | preferred_zone->compact_considered = 0; | 2013 | preferred_zone->compact_considered = 0; |
2014 | preferred_zone->compact_defer_shift = 0; | 2014 | preferred_zone->compact_defer_shift = 0; |
2015 | if (order >= preferred_zone->compact_order_failed) | ||
2016 | preferred_zone->compact_order_failed = order + 1; | ||
2015 | count_vm_event(COMPACTSUCCESS); | 2017 | count_vm_event(COMPACTSUCCESS); |
2016 | return page; | 2018 | return page; |
2017 | } | 2019 | } |
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2028 | * defer if the failure was a sync compaction failure. | 2030 | * defer if the failure was a sync compaction failure. |
2029 | */ | 2031 | */ |
2030 | if (sync_migration) | 2032 | if (sync_migration) |
2031 | defer_compaction(preferred_zone); | 2033 | defer_compaction(preferred_zone, order); |
2032 | 2034 | ||
2033 | cond_resched(); | 2035 | cond_resched(); |
2034 | } | 2036 | } |
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2378 | { | 2380 | { |
2379 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2381 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2380 | struct zone *preferred_zone; | 2382 | struct zone *preferred_zone; |
2381 | struct page *page; | 2383 | struct page *page = NULL; |
2382 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2384 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2385 | unsigned int cpuset_mems_cookie; | ||
2383 | 2386 | ||
2384 | gfp_mask &= gfp_allowed_mask; | 2387 | gfp_mask &= gfp_allowed_mask; |
2385 | 2388 | ||
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2398 | if (unlikely(!zonelist->_zonerefs->zone)) | 2401 | if (unlikely(!zonelist->_zonerefs->zone)) |
2399 | return NULL; | 2402 | return NULL; |
2400 | 2403 | ||
2401 | get_mems_allowed(); | 2404 | retry_cpuset: |
2405 | cpuset_mems_cookie = get_mems_allowed(); | ||
2406 | |||
2402 | /* The preferred zone is used for statistics later */ | 2407 | /* The preferred zone is used for statistics later */ |
2403 | first_zones_zonelist(zonelist, high_zoneidx, | 2408 | first_zones_zonelist(zonelist, high_zoneidx, |
2404 | nodemask ? : &cpuset_current_mems_allowed, | 2409 | nodemask ? : &cpuset_current_mems_allowed, |
2405 | &preferred_zone); | 2410 | &preferred_zone); |
2406 | if (!preferred_zone) { | 2411 | if (!preferred_zone) |
2407 | put_mems_allowed(); | 2412 | goto out; |
2408 | return NULL; | ||
2409 | } | ||
2410 | 2413 | ||
2411 | /* First allocation attempt */ | 2414 | /* First allocation attempt */ |
2412 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2415 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2416 | page = __alloc_pages_slowpath(gfp_mask, order, | 2419 | page = __alloc_pages_slowpath(gfp_mask, order, |
2417 | zonelist, high_zoneidx, nodemask, | 2420 | zonelist, high_zoneidx, nodemask, |
2418 | preferred_zone, migratetype); | 2421 | preferred_zone, migratetype); |
2419 | put_mems_allowed(); | ||
2420 | 2422 | ||
2421 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2423 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2424 | |||
2425 | out: | ||
2426 | /* | ||
2427 | * When updating a task's mems_allowed, it is possible to race with | ||
2428 | * parallel threads in such a way that an allocation can fail while | ||
2429 | * the mask is being updated. If a page allocation is about to fail, | ||
2430 | * check if the cpuset changed during allocation and if so, retry. | ||
2431 | */ | ||
2432 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
2433 | goto retry_cpuset; | ||
2434 | |||
2422 | return page; | 2435 | return page; |
2423 | } | 2436 | } |
2424 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2437 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2632 | bool skip_free_areas_node(unsigned int flags, int nid) | 2645 | bool skip_free_areas_node(unsigned int flags, int nid) |
2633 | { | 2646 | { |
2634 | bool ret = false; | 2647 | bool ret = false; |
2648 | unsigned int cpuset_mems_cookie; | ||
2635 | 2649 | ||
2636 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2650 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2637 | goto out; | 2651 | goto out; |
2638 | 2652 | ||
2639 | get_mems_allowed(); | 2653 | do { |
2640 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2654 | cpuset_mems_cookie = get_mems_allowed(); |
2641 | put_mems_allowed(); | 2655 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2656 | } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
2642 | out: | 2657 | out: |
2643 | return ret; | 2658 | return ret; |
2644 | } | 2659 | } |
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
3925 | } | 3940 | } |
3926 | } | 3941 | } |
3927 | 3942 | ||
3928 | int __init add_from_early_node_map(struct range *range, int az, | ||
3929 | int nr_range, int nid) | ||
3930 | { | ||
3931 | unsigned long start_pfn, end_pfn; | ||
3932 | int i; | ||
3933 | |||
3934 | /* need to go over early_node_map to find out good range for node */ | ||
3935 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) | ||
3936 | nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); | ||
3937 | return nr_range; | ||
3938 | } | ||
3939 | |||
3940 | /** | 3943 | /** |
3941 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 3944 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
3942 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 3945 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4521 | * memory. When they don't, some nodes will have more kernelcore than | 4524 | * memory. When they don't, some nodes will have more kernelcore than |
4522 | * others | 4525 | * others |
4523 | */ | 4526 | */ |
4524 | static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 4527 | static void __init find_zone_movable_pfns_for_nodes(void) |
4525 | { | 4528 | { |
4526 | int i, nid; | 4529 | int i, nid; |
4527 | unsigned long usable_startpfn; | 4530 | unsigned long usable_startpfn; |
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4713 | 4716 | ||
4714 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 4717 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4715 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 4718 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
4716 | find_zone_movable_pfns_for_nodes(zone_movable_pfn); | 4719 | find_zone_movable_pfns_for_nodes(); |
4717 | 4720 | ||
4718 | /* Print out the zone ranges */ | 4721 | /* Print out the zone ranges */ |
4719 | printk("Zone PFN ranges:\n"); | 4722 | printk("Zone PFN ranges:\n"); |
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
4823 | int cpu = (unsigned long)hcpu; | 4826 | int cpu = (unsigned long)hcpu; |
4824 | 4827 | ||
4825 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4828 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4829 | lru_add_drain_cpu(cpu); | ||
4826 | drain_pages(cpu); | 4830 | drain_pages(cpu); |
4827 | 4831 | ||
4828 | /* | 4832 | /* |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff660..aa9701e12714 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd(walk->mm, pmd); |
62 | if (pmd_none_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index eb663fb533e0..5a74fea182f1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
70 | unsigned long address, pmd_t *pmdp) | 70 | unsigned long address, pmd_t *pmdp) |
71 | { | 71 | { |
72 | int young; | 72 | int young; |
73 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 73 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
74 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
75 | #else | ||
74 | BUG(); | 76 | BUG(); |
75 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 77 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
76 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
77 | young = pmdp_test_and_clear_young(vma, address, pmdp); | 78 | young = pmdp_test_and_clear_young(vma, address, pmdp); |
78 | if (young) | 79 | if (young) |
79 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 80 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | 120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); |
121 | } | 121 | } |
122 | 122 | ||
123 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
124 | struct anon_vma_chain *avc, | ||
125 | struct anon_vma *anon_vma) | ||
126 | { | ||
127 | avc->vma = vma; | ||
128 | avc->anon_vma = anon_vma; | ||
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
130 | |||
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | ||
137 | |||
123 | /** | 138 | /** |
124 | * anon_vma_prepare - attach an anon_vma to a memory region | 139 | * anon_vma_prepare - attach an anon_vma to a memory region |
125 | * @vma: the memory region in question | 140 | * @vma: the memory region in question |
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
175 | spin_lock(&mm->page_table_lock); | 190 | spin_lock(&mm->page_table_lock); |
176 | if (likely(!vma->anon_vma)) { | 191 | if (likely(!vma->anon_vma)) { |
177 | vma->anon_vma = anon_vma; | 192 | vma->anon_vma = anon_vma; |
178 | avc->anon_vma = anon_vma; | 193 | anon_vma_chain_link(vma, avc, anon_vma); |
179 | avc->vma = vma; | ||
180 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
181 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
182 | allocated = NULL; | 194 | allocated = NULL; |
183 | avc = NULL; | 195 | avc = NULL; |
184 | } | 196 | } |
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) | |||
224 | mutex_unlock(&root->mutex); | 236 | mutex_unlock(&root->mutex); |
225 | } | 237 | } |
226 | 238 | ||
227 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
228 | struct anon_vma_chain *avc, | ||
229 | struct anon_vma *anon_vma) | ||
230 | { | ||
231 | avc->vma = vma; | ||
232 | avc->anon_vma = anon_vma; | ||
233 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
234 | |||
235 | /* | ||
236 | * It's critical to add new vmas to the tail of the anon_vma, | ||
237 | * see comment in huge_memory.c:__split_huge_page(). | ||
238 | */ | ||
239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
240 | } | ||
241 | |||
242 | /* | 239 | /* |
243 | * Attach the anon_vmas from src to dst. | 240 | * Attach the anon_vmas from src to dst. |
244 | * Returns 0 on success, -ENOMEM on failure. | 241 | * Returns 0 on success, -ENOMEM on failure. |
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page, | |||
1151 | */ | 1148 | */ |
1152 | void page_add_file_rmap(struct page *page) | 1149 | void page_add_file_rmap(struct page *page) |
1153 | { | 1150 | { |
1151 | bool locked; | ||
1152 | unsigned long flags; | ||
1153 | |||
1154 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1154 | if (atomic_inc_and_test(&page->_mapcount)) { | 1155 | if (atomic_inc_and_test(&page->_mapcount)) { |
1155 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1156 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1156 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1157 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1157 | } | 1158 | } |
1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1158 | } | 1160 | } |
1159 | 1161 | ||
1160 | /** | 1162 | /** |
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page) | |||
1165 | */ | 1167 | */ |
1166 | void page_remove_rmap(struct page *page) | 1168 | void page_remove_rmap(struct page *page) |
1167 | { | 1169 | { |
1170 | bool anon = PageAnon(page); | ||
1171 | bool locked; | ||
1172 | unsigned long flags; | ||
1173 | |||
1174 | /* | ||
1175 | * The anon case has no mem_cgroup page_stat to update; but may | ||
1176 | * uncharge_page() below, where the lock ordering can deadlock if | ||
1177 | * we hold the lock against page_stat move: so avoid it on anon. | ||
1178 | */ | ||
1179 | if (!anon) | ||
1180 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1181 | |||
1168 | /* page still mapped by someone else? */ | 1182 | /* page still mapped by someone else? */ |
1169 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1183 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1170 | return; | 1184 | goto out; |
1171 | 1185 | ||
1172 | /* | 1186 | /* |
1173 | * Now that the last pte has gone, s390 must transfer dirty | 1187 | * Now that the last pte has gone, s390 must transfer dirty |
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page) | |||
1176 | * not if it's in swapcache - there might be another pte slot | 1190 | * not if it's in swapcache - there might be another pte slot |
1177 | * containing the swap entry, but page not yet written to swap. | 1191 | * containing the swap entry, but page not yet written to swap. |
1178 | */ | 1192 | */ |
1179 | if ((!PageAnon(page) || PageSwapCache(page)) && | 1193 | if ((!anon || PageSwapCache(page)) && |
1180 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | 1194 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
1181 | set_page_dirty(page); | 1195 | set_page_dirty(page); |
1182 | /* | 1196 | /* |
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page) | |||
1184 | * and not charged by memcg for now. | 1198 | * and not charged by memcg for now. |
1185 | */ | 1199 | */ |
1186 | if (unlikely(PageHuge(page))) | 1200 | if (unlikely(PageHuge(page))) |
1187 | return; | 1201 | goto out; |
1188 | if (PageAnon(page)) { | 1202 | if (anon) { |
1189 | mem_cgroup_uncharge_page(page); | 1203 | mem_cgroup_uncharge_page(page); |
1190 | if (!PageTransHuge(page)) | 1204 | if (!PageTransHuge(page)) |
1191 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1205 | __dec_zone_page_state(page, NR_ANON_PAGES); |
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page) | |||
1205 | * Leaving it set also helps swapoff to reinstate ptes | 1219 | * Leaving it set also helps swapoff to reinstate ptes |
1206 | * faster for those pages still in swapcache. | 1220 | * faster for those pages still in swapcache. |
1207 | */ | 1221 | */ |
1222 | out: | ||
1223 | if (!anon) | ||
1224 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1208 | } | 1225 | } |
1209 | 1226 | ||
1210 | /* | 1227 | /* |
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1282 | } | 1299 | } |
1283 | dec_mm_counter(mm, MM_ANONPAGES); | 1300 | dec_mm_counter(mm, MM_ANONPAGES); |
1284 | inc_mm_counter(mm, MM_SWAPENTS); | 1301 | inc_mm_counter(mm, MM_SWAPENTS); |
1285 | } else if (PAGE_MIGRATION) { | 1302 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { |
1286 | /* | 1303 | /* |
1287 | * Store the pfn of the page in a special migration | 1304 | * Store the pfn of the page in a special migration |
1288 | * pte. do_swap_page() will wait until the migration | 1305 | * pte. do_swap_page() will wait until the migration |
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1293 | } | 1310 | } |
1294 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 1311 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
1295 | BUG_ON(pte_file(*pte)); | 1312 | BUG_ON(pte_file(*pte)); |
1296 | } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1313 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1314 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | ||
1297 | /* Establish migration entry for a file page */ | 1315 | /* Establish migration entry for a file page */ |
1298 | swp_entry_t entry; | 1316 | swp_entry_t entry; |
1299 | entry = make_migration_entry(page, pte_write(pteval)); | 1317 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | * locking requirements of exec(), migration skips | 1517 | * locking requirements of exec(), migration skips |
1500 | * temporary VMAs until after exec() completes. | 1518 | * temporary VMAs until after exec() completes. |
1501 | */ | 1519 | */ |
1502 | if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && | 1520 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && |
1503 | is_vma_temporary_stack(vma)) | 1521 | is_vma_temporary_stack(vma)) |
1504 | continue; | 1522 | continue; |
1505 | 1523 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 7a45ad004cfd..f99ff3e50bd6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1178 | static const struct inode_operations shmem_symlink_inode_operations; | 1178 | static const struct inode_operations shmem_symlink_inode_operations; |
1179 | static const struct inode_operations shmem_short_symlink_operations; | 1179 | static const struct inode_operations shmem_short_symlink_operations; |
1180 | 1180 | ||
1181 | #ifdef CONFIG_TMPFS_XATTR | ||
1182 | static int shmem_initxattrs(struct inode *, const struct xattr *, void *); | ||
1183 | #else | ||
1184 | #define shmem_initxattrs NULL | ||
1185 | #endif | ||
1186 | |||
1181 | static int | 1187 | static int |
1182 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1188 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1183 | loff_t pos, unsigned len, unsigned flags, | 1189 | loff_t pos, unsigned len, unsigned flags, |
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1490 | if (inode) { | 1496 | if (inode) { |
1491 | error = security_inode_init_security(inode, dir, | 1497 | error = security_inode_init_security(inode, dir, |
1492 | &dentry->d_name, | 1498 | &dentry->d_name, |
1493 | NULL, NULL); | 1499 | shmem_initxattrs, NULL); |
1494 | if (error) { | 1500 | if (error) { |
1495 | if (error != -EOPNOTSUPP) { | 1501 | if (error != -EOPNOTSUPP) { |
1496 | iput(inode); | 1502 | iput(inode); |
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1630 | return -ENOSPC; | 1636 | return -ENOSPC; |
1631 | 1637 | ||
1632 | error = security_inode_init_security(inode, dir, &dentry->d_name, | 1638 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
1633 | NULL, NULL); | 1639 | shmem_initxattrs, NULL); |
1634 | if (error) { | 1640 | if (error) { |
1635 | if (error != -EOPNOTSUPP) { | 1641 | if (error != -EOPNOTSUPP) { |
1636 | iput(inode); | 1642 | iput(inode); |
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
1704 | * filesystem level, though. | 1710 | * filesystem level, though. |
1705 | */ | 1711 | */ |
1706 | 1712 | ||
1713 | /* | ||
1714 | * Allocate new xattr and copy in the value; but leave the name to callers. | ||
1715 | */ | ||
1716 | static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) | ||
1717 | { | ||
1718 | struct shmem_xattr *new_xattr; | ||
1719 | size_t len; | ||
1720 | |||
1721 | /* wrap around? */ | ||
1722 | len = sizeof(*new_xattr) + size; | ||
1723 | if (len <= sizeof(*new_xattr)) | ||
1724 | return NULL; | ||
1725 | |||
1726 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1727 | if (!new_xattr) | ||
1728 | return NULL; | ||
1729 | |||
1730 | new_xattr->size = size; | ||
1731 | memcpy(new_xattr->value, value, size); | ||
1732 | return new_xattr; | ||
1733 | } | ||
1734 | |||
1735 | /* | ||
1736 | * Callback for security_inode_init_security() for acquiring xattrs. | ||
1737 | */ | ||
1738 | static int shmem_initxattrs(struct inode *inode, | ||
1739 | const struct xattr *xattr_array, | ||
1740 | void *fs_info) | ||
1741 | { | ||
1742 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
1743 | const struct xattr *xattr; | ||
1744 | struct shmem_xattr *new_xattr; | ||
1745 | size_t len; | ||
1746 | |||
1747 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
1748 | new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); | ||
1749 | if (!new_xattr) | ||
1750 | return -ENOMEM; | ||
1751 | |||
1752 | len = strlen(xattr->name) + 1; | ||
1753 | new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, | ||
1754 | GFP_KERNEL); | ||
1755 | if (!new_xattr->name) { | ||
1756 | kfree(new_xattr); | ||
1757 | return -ENOMEM; | ||
1758 | } | ||
1759 | |||
1760 | memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, | ||
1761 | XATTR_SECURITY_PREFIX_LEN); | ||
1762 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, | ||
1763 | xattr->name, len); | ||
1764 | |||
1765 | spin_lock(&info->lock); | ||
1766 | list_add(&new_xattr->list, &info->xattr_list); | ||
1767 | spin_unlock(&info->lock); | ||
1768 | } | ||
1769 | |||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1707 | static int shmem_xattr_get(struct dentry *dentry, const char *name, | 1773 | static int shmem_xattr_get(struct dentry *dentry, const char *name, |
1708 | void *buffer, size_t size) | 1774 | void *buffer, size_t size) |
1709 | { | 1775 | { |
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name, | |||
1731 | return ret; | 1797 | return ret; |
1732 | } | 1798 | } |
1733 | 1799 | ||
1734 | static int shmem_xattr_set(struct dentry *dentry, const char *name, | 1800 | static int shmem_xattr_set(struct inode *inode, const char *name, |
1735 | const void *value, size_t size, int flags) | 1801 | const void *value, size_t size, int flags) |
1736 | { | 1802 | { |
1737 | struct inode *inode = dentry->d_inode; | ||
1738 | struct shmem_inode_info *info = SHMEM_I(inode); | 1803 | struct shmem_inode_info *info = SHMEM_I(inode); |
1739 | struct shmem_xattr *xattr; | 1804 | struct shmem_xattr *xattr; |
1740 | struct shmem_xattr *new_xattr = NULL; | 1805 | struct shmem_xattr *new_xattr = NULL; |
1741 | size_t len; | ||
1742 | int err = 0; | 1806 | int err = 0; |
1743 | 1807 | ||
1744 | /* value == NULL means remove */ | 1808 | /* value == NULL means remove */ |
1745 | if (value) { | 1809 | if (value) { |
1746 | /* wrap around? */ | 1810 | new_xattr = shmem_xattr_alloc(value, size); |
1747 | len = sizeof(*new_xattr) + size; | ||
1748 | if (len <= sizeof(*new_xattr)) | ||
1749 | return -ENOMEM; | ||
1750 | |||
1751 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1752 | if (!new_xattr) | 1811 | if (!new_xattr) |
1753 | return -ENOMEM; | 1812 | return -ENOMEM; |
1754 | 1813 | ||
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name, | |||
1757 | kfree(new_xattr); | 1816 | kfree(new_xattr); |
1758 | return -ENOMEM; | 1817 | return -ENOMEM; |
1759 | } | 1818 | } |
1760 | |||
1761 | new_xattr->size = size; | ||
1762 | memcpy(new_xattr->value, value, size); | ||
1763 | } | 1819 | } |
1764 | 1820 | ||
1765 | spin_lock(&info->lock); | 1821 | spin_lock(&info->lock); |
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
1858 | if (size == 0) | 1914 | if (size == 0) |
1859 | value = ""; /* empty EA, do not remove */ | 1915 | value = ""; /* empty EA, do not remove */ |
1860 | 1916 | ||
1861 | return shmem_xattr_set(dentry, name, value, size, flags); | 1917 | return shmem_xattr_set(dentry->d_inode, name, value, size, flags); |
1862 | 1918 | ||
1863 | } | 1919 | } |
1864 | 1920 | ||
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
1878 | if (err) | 1934 | if (err) |
1879 | return err; | 1935 | return err; |
1880 | 1936 | ||
1881 | return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); | 1937 | return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); |
1882 | } | 1938 | } |
1883 | 1939 | ||
1884 | static bool xattr_is_trusted(const char *name) | 1940 | static bool xattr_is_trusted(const char *name) |
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3285 | return NULL; | 3285 | return NULL; |
3286 | nid_alloc = nid_here = numa_mem_id(); | 3286 | nid_alloc = nid_here = numa_mem_id(); |
3287 | get_mems_allowed(); | ||
3288 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3287 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3289 | nid_alloc = cpuset_slab_spread_node(); | 3288 | nid_alloc = cpuset_slab_spread_node(); |
3290 | else if (current->mempolicy) | 3289 | else if (current->mempolicy) |
3291 | nid_alloc = slab_node(current->mempolicy); | 3290 | nid_alloc = slab_node(current->mempolicy); |
3292 | put_mems_allowed(); | ||
3293 | if (nid_alloc != nid_here) | 3291 | if (nid_alloc != nid_here) |
3294 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3292 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3295 | return NULL; | 3293 | return NULL; |
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3312 | enum zone_type high_zoneidx = gfp_zone(flags); | 3310 | enum zone_type high_zoneidx = gfp_zone(flags); |
3313 | void *obj = NULL; | 3311 | void *obj = NULL; |
3314 | int nid; | 3312 | int nid; |
3313 | unsigned int cpuset_mems_cookie; | ||
3315 | 3314 | ||
3316 | if (flags & __GFP_THISNODE) | 3315 | if (flags & __GFP_THISNODE) |
3317 | return NULL; | 3316 | return NULL; |
3318 | 3317 | ||
3319 | get_mems_allowed(); | ||
3320 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3321 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3318 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3322 | 3319 | ||
3320 | retry_cpuset: | ||
3321 | cpuset_mems_cookie = get_mems_allowed(); | ||
3322 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3323 | |||
3323 | retry: | 3324 | retry: |
3324 | /* | 3325 | /* |
3325 | * Look through allowed nodes for objects available | 3326 | * Look through allowed nodes for objects available |
@@ -3372,7 +3373,9 @@ retry: | |||
3372 | } | 3373 | } |
3373 | } | 3374 | } |
3374 | } | 3375 | } |
3375 | put_mems_allowed(); | 3376 | |
3377 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
3378 | goto retry_cpuset; | ||
3376 | return obj; | 3379 | return obj; |
3377 | } | 3380 | } |
3378 | 3381 | ||
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1581 | struct zone *zone; | 1581 | struct zone *zone; |
1582 | enum zone_type high_zoneidx = gfp_zone(flags); | 1582 | enum zone_type high_zoneidx = gfp_zone(flags); |
1583 | void *object; | 1583 | void *object; |
1584 | unsigned int cpuset_mems_cookie; | ||
1584 | 1585 | ||
1585 | /* | 1586 | /* |
1586 | * The defrag ratio allows a configuration of the tradeoffs between | 1587 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1604 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1605 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1605 | return NULL; | 1606 | return NULL; |
1606 | 1607 | ||
1607 | get_mems_allowed(); | 1608 | do { |
1608 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | cpuset_mems_cookie = get_mems_allowed(); |
1609 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1610 | struct kmem_cache_node *n; | 1611 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1611 | 1612 | struct kmem_cache_node *n; | |
1612 | n = get_node(s, zone_to_nid(zone)); | 1613 | |
1613 | 1614 | n = get_node(s, zone_to_nid(zone)); | |
1614 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1615 | |
1615 | n->nr_partial > s->min_partial) { | 1616 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1616 | object = get_partial_node(s, n, c); | 1617 | n->nr_partial > s->min_partial) { |
1617 | if (object) { | 1618 | object = get_partial_node(s, n, c); |
1618 | put_mems_allowed(); | 1619 | if (object) { |
1619 | return object; | 1620 | /* |
1621 | * Return the object even if | ||
1622 | * put_mems_allowed indicated that | ||
1623 | * the cpuset mems_allowed was | ||
1624 | * updated in parallel. It's a | ||
1625 | * harmless race between the alloc | ||
1626 | * and the cpuset update. | ||
1627 | */ | ||
1628 | put_mems_allowed(cpuset_mems_cookie); | ||
1629 | return object; | ||
1630 | } | ||
1620 | } | 1631 | } |
1621 | } | 1632 | } |
1622 | } | 1633 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
1623 | put_mems_allowed(); | ||
1624 | #endif | 1634 | #endif |
1625 | return NULL; | 1635 | return NULL; |
1626 | } | 1636 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 61d7cde23111..a8bc7d364deb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
353 | 353 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 355 | usemap_count); |
356 | if (usemap) { | 356 | if (!usemap) { |
357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
358 | if (!present_section_nr(pnum)) | 358 | if (!usemap) { |
359 | continue; | 359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
360 | usemap_map[pnum] = usemap; | 360 | return; |
361 | usemap += size; | ||
362 | } | 361 | } |
363 | return; | ||
364 | } | 362 | } |
365 | 363 | ||
366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
367 | if (usemap) { | 365 | if (!present_section_nr(pnum)) |
368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 366 | continue; |
369 | if (!present_section_nr(pnum)) | 367 | usemap_map[pnum] = usemap; |
370 | continue; | 368 | usemap += size; |
371 | usemap_map[pnum] = usemap; | 369 | check_usemap_section_nr(nodeid, usemap_map[pnum]); |
372 | usemap += size; | ||
373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
374 | } | ||
375 | return; | ||
376 | } | 370 | } |
377 | |||
378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
379 | } | 371 | } |
380 | 372 | ||
381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 373 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
496 | * Either "cpu" is the current CPU, and preemption has already been | 496 | * Either "cpu" is the current CPU, and preemption has already been |
497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | 497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. |
498 | */ | 498 | */ |
499 | static void drain_cpu_pagevecs(int cpu) | 499 | void lru_add_drain_cpu(int cpu) |
500 | { | 500 | { |
501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | 501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); |
502 | struct pagevec *pvec; | 502 | struct pagevec *pvec; |
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page) | |||
553 | 553 | ||
554 | void lru_add_drain(void) | 554 | void lru_add_drain(void) |
555 | { | 555 | { |
556 | drain_cpu_pagevecs(get_cpu()); | 556 | lru_add_drain_cpu(get_cpu()); |
557 | put_cpu(); | 557 | put_cpu(); |
558 | } | 558 | } |
559 | 559 | ||
diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..9d3dd3763cf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
372 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | 372 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, |
373 | struct vm_area_struct *vma, unsigned long addr) | 373 | struct vm_area_struct *vma, unsigned long addr) |
374 | { | 374 | { |
375 | int nr_pages; | ||
376 | struct page *page; | 375 | struct page *page; |
377 | unsigned long offset; | 376 | unsigned long offset = swp_offset(entry); |
378 | unsigned long end_offset; | 377 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | ||
379 | 379 | ||
380 | /* | 380 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | * Get starting offset for readaround, and number of pages to read. | 381 | start_offset = offset & ~mask; |
382 | * Adjust starting address by readbehind (for NUMA interleave case)? | 382 | end_offset = offset | mask; |
383 | * No, it's very unlikely that swap layout would follow vma layout, | 383 | if (!start_offset) /* First page is swap header. */ |
384 | * more likely that neighbouring swap pages came from the same node: | 384 | start_offset++; |
385 | * so use the same "addr" to choose the same node for each swap read. | 385 | |
386 | */ | 386 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | nr_pages = valid_swaphandles(entry, &offset); | ||
388 | for (end_offset = offset + nr_pages; offset < end_offset; offset++) { | ||
389 | /* Ok, do the async read-ahead now */ | 387 | /* Ok, do the async read-ahead now */ |
390 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
391 | gfp_mask, vma, addr); | 389 | gfp_mask, vma, addr); |
392 | if (!page) | 390 | if (!page) |
393 | break; | 391 | continue; |
394 | page_cache_release(page); | 392 | page_cache_release(page); |
395 | } | 393 | } |
396 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 394 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6bf67ab6e469..dae42f380d6e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
932 | pmd = pmd_offset(pud, addr); | 932 | pmd = pmd_offset(pud, addr); |
933 | do { | 933 | do { |
934 | next = pmd_addr_end(addr, end); | 934 | next = pmd_addr_end(addr, end); |
935 | if (unlikely(pmd_trans_huge(*pmd))) | 935 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
936 | continue; | ||
937 | if (pmd_none_or_clear_bad(pmd)) | ||
938 | continue; | 936 | continue; |
939 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 937 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
940 | if (ret) | 938 | if (ret) |
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2107 | p->flags |= SWP_SOLIDSTATE; | 2105 | p->flags |= SWP_SOLIDSTATE; |
2108 | p->cluster_next = 1 + (random32() % p->highest_bit); | 2106 | p->cluster_next = 1 + (random32() % p->highest_bit); |
2109 | } | 2107 | } |
2110 | if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) | 2108 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) |
2111 | p->flags |= SWP_DISCARDABLE; | 2109 | p->flags |= SWP_DISCARDABLE; |
2112 | } | 2110 | } |
2113 | 2111 | ||
@@ -2292,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry) | |||
2292 | } | 2290 | } |
2293 | 2291 | ||
2294 | /* | 2292 | /* |
2295 | * swap_lock prevents swap_map being freed. Don't grab an extra | ||
2296 | * reference on the swaphandle, it doesn't matter if it becomes unused. | ||
2297 | */ | ||
2298 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | ||
2299 | { | ||
2300 | struct swap_info_struct *si; | ||
2301 | int our_page_cluster = page_cluster; | ||
2302 | pgoff_t target, toff; | ||
2303 | pgoff_t base, end; | ||
2304 | int nr_pages = 0; | ||
2305 | |||
2306 | if (!our_page_cluster) /* no readahead */ | ||
2307 | return 0; | ||
2308 | |||
2309 | si = swap_info[swp_type(entry)]; | ||
2310 | target = swp_offset(entry); | ||
2311 | base = (target >> our_page_cluster) << our_page_cluster; | ||
2312 | end = base + (1 << our_page_cluster); | ||
2313 | if (!base) /* first page is swap header */ | ||
2314 | base++; | ||
2315 | |||
2316 | spin_lock(&swap_lock); | ||
2317 | if (end > si->max) /* don't go beyond end of map */ | ||
2318 | end = si->max; | ||
2319 | |||
2320 | /* Count contiguous allocated slots above our target */ | ||
2321 | for (toff = target; ++toff < end; nr_pages++) { | ||
2322 | /* Don't read in free or bad pages */ | ||
2323 | if (!si->swap_map[toff]) | ||
2324 | break; | ||
2325 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2326 | break; | ||
2327 | } | ||
2328 | /* Count contiguous allocated slots below our target */ | ||
2329 | for (toff = target; --toff >= base; nr_pages++) { | ||
2330 | /* Don't read in free or bad pages */ | ||
2331 | if (!si->swap_map[toff]) | ||
2332 | break; | ||
2333 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2334 | break; | ||
2335 | } | ||
2336 | spin_unlock(&swap_lock); | ||
2337 | |||
2338 | /* | ||
2339 | * Indicate starting offset, and return number of pages to get: | ||
2340 | * if only 1, say 0, since there's then no readahead to be done. | ||
2341 | */ | ||
2342 | *offset = ++toff; | ||
2343 | return nr_pages? ++nr_pages: 0; | ||
2344 | } | ||
2345 | |||
2346 | /* | ||
2347 | * add_swap_count_continuation - called when a swap count is duplicated | 2293 | * add_swap_count_continuation - called when a swap count is duplicated |
2348 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2294 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
2349 | * page of the original vmalloc'ed swap_map, to hold the continuation count | 2295 | * page of the original vmalloc'ed swap_map, to hold the continuation count |
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
239 | next->vm_prev = vma; | 239 | next->vm_prev = vma; |
240 | } | 240 | } |
241 | 241 | ||
242 | /* Check if the vma is being used as a stack by this task */ | ||
243 | static int vm_is_stack_for_task(struct task_struct *t, | ||
244 | struct vm_area_struct *vma) | ||
245 | { | ||
246 | return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Check if the vma is being used as a stack. | ||
251 | * If in_group is non-zero, check the entire thread group, or else | ||
252 | * check only the given task. Returns the pid of the task that uses | ||
253 | * the vma as its stack, or 0 if there is none. | ||
254 | */ | ||
255 | pid_t vm_is_stack(struct task_struct *task, | ||
256 | struct vm_area_struct *vma, int in_group) | ||
257 | { | ||
258 | pid_t ret = 0; | ||
259 | |||
260 | if (vm_is_stack_for_task(task, vma)) | ||
261 | return task->pid; | ||
262 | |||
263 | if (in_group) { | ||
264 | struct task_struct *t; | ||
265 | rcu_read_lock(); | ||
266 | if (!pid_alive(task)) | ||
267 | goto done; | ||
268 | |||
269 | t = task; | ||
270 | do { | ||
271 | if (vm_is_stack_for_task(t, vma)) { | ||
272 | ret = t->pid; | ||
273 | goto done; | ||
274 | } | ||
275 | } while_each_thread(task, t); | ||
276 | done: | ||
277 | rcu_read_unlock(); | ||
278 | } | ||
279 | |||
280 | return ret; | ||
281 | } | ||
282 | |||
242 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 283 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
243 | void arch_pick_mmap_layout(struct mm_struct *mm) | 284 | void arch_pick_mmap_layout(struct mm_struct *mm) |
244 | { | 285 | { |
diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b23552659..49f15ef0a99a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1138 | * @mz: The mem_cgroup_zone to pull pages from. | 1138 | * @mz: The mem_cgroup_zone to pull pages from. |
1139 | * @dst: The temp list to put pages on to. | 1139 | * @dst: The temp list to put pages on to. |
1140 | * @nr_scanned: The number of pages that were scanned. | 1140 | * @nr_scanned: The number of pages that were scanned. |
1141 | * @order: The caller's attempted allocation order | 1141 | * @sc: The scan_control struct for this reclaim session |
1142 | * @mode: One of the LRU isolation modes | 1142 | * @mode: One of the LRU isolation modes |
1143 | * @active: True [1] if isolating active pages | 1143 | * @active: True [1] if isolating active pages |
1144 | * @file: True [1] if isolating file [!anon] pages | 1144 | * @file: True [1] if isolating file [!anon] pages |
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1147 | */ | 1147 | */ |
1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1149 | struct mem_cgroup_zone *mz, struct list_head *dst, | 1149 | struct mem_cgroup_zone *mz, struct list_head *dst, |
1150 | unsigned long *nr_scanned, int order, isolate_mode_t mode, | 1150 | unsigned long *nr_scanned, struct scan_control *sc, |
1151 | int active, int file) | 1151 | isolate_mode_t mode, int active, int file) |
1152 | { | 1152 | { |
1153 | struct lruvec *lruvec; | 1153 | struct lruvec *lruvec; |
1154 | struct list_head *src; | 1154 | struct list_head *src; |
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1194 | BUG(); | 1194 | BUG(); |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | if (!order) | 1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) |
1198 | continue; | 1198 | continue; |
1199 | 1199 | ||
1200 | /* | 1200 | /* |
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1208 | */ | 1208 | */ |
1209 | zone_id = page_zone_id(page); | 1209 | zone_id = page_zone_id(page); |
1210 | page_pfn = page_to_pfn(page); | 1210 | page_pfn = page_to_pfn(page); |
1211 | pfn = page_pfn & ~((1 << order) - 1); | 1211 | pfn = page_pfn & ~((1 << sc->order) - 1); |
1212 | end_pfn = pfn + (1 << order); | 1212 | end_pfn = pfn + (1 << sc->order); |
1213 | for (; pfn < end_pfn; pfn++) { | 1213 | for (; pfn < end_pfn; pfn++) { |
1214 | struct page *cursor_page; | 1214 | struct page *cursor_page; |
1215 | 1215 | ||
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1275 | 1275 | ||
1276 | *nr_scanned = scan; | 1276 | *nr_scanned = scan; |
1277 | 1277 | ||
1278 | trace_mm_vmscan_lru_isolate(order, | 1278 | trace_mm_vmscan_lru_isolate(sc->order, |
1279 | nr_to_scan, scan, | 1279 | nr_to_scan, scan, |
1280 | nr_taken, | 1280 | nr_taken, |
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | 1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, |
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1413 | unsigned long *nr_anon, | 1413 | unsigned long *nr_anon, |
1414 | unsigned long *nr_file) | 1414 | unsigned long *nr_file) |
1415 | { | 1415 | { |
1416 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1417 | struct zone *zone = mz->zone; | 1416 | struct zone *zone = mz->zone; |
1418 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1417 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1419 | unsigned long nr_active = 0; | 1418 | unsigned long nr_active = 0; |
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1434 | count[lru] += numpages; | 1433 | count[lru] += numpages; |
1435 | } | 1434 | } |
1436 | 1435 | ||
1436 | preempt_disable(); | ||
1437 | __count_vm_events(PGDEACTIVATE, nr_active); | 1437 | __count_vm_events(PGDEACTIVATE, nr_active); |
1438 | 1438 | ||
1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | 1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; |
1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; |
1450 | 1450 | ||
1451 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1451 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); |
1452 | reclaim_stat->recent_scanned[1] += *nr_file; | 1452 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); |
1453 | preempt_enable(); | ||
1453 | } | 1454 | } |
1454 | 1455 | ||
1455 | /* | 1456 | /* |
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1509 | unsigned long nr_file; | 1510 | unsigned long nr_file; |
1510 | unsigned long nr_dirty = 0; | 1511 | unsigned long nr_dirty = 0; |
1511 | unsigned long nr_writeback = 0; | 1512 | unsigned long nr_writeback = 0; |
1512 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | 1513 | isolate_mode_t isolate_mode = ISOLATE_INACTIVE; |
1513 | struct zone *zone = mz->zone; | 1514 | struct zone *zone = mz->zone; |
1515 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1514 | 1516 | ||
1515 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1517 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1516 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1518 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | 1524 | ||
1523 | set_reclaim_mode(priority, sc, false); | 1525 | set_reclaim_mode(priority, sc, false); |
1524 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | 1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
1525 | reclaim_mode |= ISOLATE_ACTIVE; | 1527 | isolate_mode |= ISOLATE_ACTIVE; |
1526 | 1528 | ||
1527 | lru_add_drain(); | 1529 | lru_add_drain(); |
1528 | 1530 | ||
1529 | if (!sc->may_unmap) | 1531 | if (!sc->may_unmap) |
1530 | reclaim_mode |= ISOLATE_UNMAPPED; | 1532 | isolate_mode |= ISOLATE_UNMAPPED; |
1531 | if (!sc->may_writepage) | 1533 | if (!sc->may_writepage) |
1532 | reclaim_mode |= ISOLATE_CLEAN; | 1534 | isolate_mode |= ISOLATE_CLEAN; |
1533 | 1535 | ||
1534 | spin_lock_irq(&zone->lru_lock); | 1536 | spin_lock_irq(&zone->lru_lock); |
1535 | 1537 | ||
1536 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, | 1538 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, |
1537 | &nr_scanned, sc->order, | 1539 | sc, isolate_mode, 0, file); |
1538 | reclaim_mode, 0, file); | ||
1539 | if (global_reclaim(sc)) { | 1540 | if (global_reclaim(sc)) { |
1540 | zone->pages_scanned += nr_scanned; | 1541 | zone->pages_scanned += nr_scanned; |
1541 | if (current_is_kswapd()) | 1542 | if (current_is_kswapd()) |
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1545 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1546 | nr_scanned); | 1547 | nr_scanned); |
1547 | } | 1548 | } |
1549 | spin_unlock_irq(&zone->lru_lock); | ||
1548 | 1550 | ||
1549 | if (nr_taken == 0) { | 1551 | if (nr_taken == 0) |
1550 | spin_unlock_irq(&zone->lru_lock); | ||
1551 | return 0; | 1552 | return 0; |
1552 | } | ||
1553 | 1553 | ||
1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); | 1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); |
1555 | 1555 | ||
1556 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | ||
1557 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | ||
1558 | |||
1559 | spin_unlock_irq(&zone->lru_lock); | ||
1560 | |||
1561 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | 1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, |
1562 | &nr_dirty, &nr_writeback); | 1557 | &nr_dirty, &nr_writeback); |
1563 | 1558 | ||
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1570 | 1565 | ||
1571 | spin_lock_irq(&zone->lru_lock); | 1566 | spin_lock_irq(&zone->lru_lock); |
1572 | 1567 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | ||
1569 | reclaim_stat->recent_scanned[1] += nr_file; | ||
1570 | |||
1573 | if (current_is_kswapd()) | 1571 | if (current_is_kswapd()) |
1574 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1572 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
1575 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1573 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); |
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1643 | unsigned long pgmoved = 0; | 1641 | unsigned long pgmoved = 0; |
1644 | struct page *page; | 1642 | struct page *page; |
1645 | 1643 | ||
1646 | if (buffer_heads_over_limit) { | ||
1647 | spin_unlock_irq(&zone->lru_lock); | ||
1648 | list_for_each_entry(page, list, lru) { | ||
1649 | if (page_has_private(page) && trylock_page(page)) { | ||
1650 | if (page_has_private(page)) | ||
1651 | try_to_release_page(page, 0); | ||
1652 | unlock_page(page); | ||
1653 | } | ||
1654 | } | ||
1655 | spin_lock_irq(&zone->lru_lock); | ||
1656 | } | ||
1657 | |||
1658 | while (!list_empty(list)) { | 1644 | while (!list_empty(list)) { |
1659 | struct lruvec *lruvec; | 1645 | struct lruvec *lruvec; |
1660 | 1646 | ||
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1699 | struct page *page; | 1685 | struct page *page; |
1700 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1686 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1701 | unsigned long nr_rotated = 0; | 1687 | unsigned long nr_rotated = 0; |
1702 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | 1688 | isolate_mode_t isolate_mode = ISOLATE_ACTIVE; |
1703 | struct zone *zone = mz->zone; | 1689 | struct zone *zone = mz->zone; |
1704 | 1690 | ||
1705 | lru_add_drain(); | 1691 | lru_add_drain(); |
1706 | 1692 | ||
1693 | reset_reclaim_mode(sc); | ||
1694 | |||
1707 | if (!sc->may_unmap) | 1695 | if (!sc->may_unmap) |
1708 | reclaim_mode |= ISOLATE_UNMAPPED; | 1696 | isolate_mode |= ISOLATE_UNMAPPED; |
1709 | if (!sc->may_writepage) | 1697 | if (!sc->may_writepage) |
1710 | reclaim_mode |= ISOLATE_CLEAN; | 1698 | isolate_mode |= ISOLATE_CLEAN; |
1711 | 1699 | ||
1712 | spin_lock_irq(&zone->lru_lock); | 1700 | spin_lock_irq(&zone->lru_lock); |
1713 | 1701 | ||
1714 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, | 1702 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, |
1715 | &nr_scanned, sc->order, | 1703 | isolate_mode, 1, file); |
1716 | reclaim_mode, 1, file); | ||
1717 | if (global_reclaim(sc)) | 1704 | if (global_reclaim(sc)) |
1718 | zone->pages_scanned += nr_scanned; | 1705 | zone->pages_scanned += nr_scanned; |
1719 | 1706 | ||
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1737 | continue; | 1724 | continue; |
1738 | } | 1725 | } |
1739 | 1726 | ||
1727 | if (unlikely(buffer_heads_over_limit)) { | ||
1728 | if (page_has_private(page) && trylock_page(page)) { | ||
1729 | if (page_has_private(page)) | ||
1730 | try_to_release_page(page, 0); | ||
1731 | unlock_page(page); | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1740 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1735 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { |
1741 | nr_rotated += hpage_nr_pages(page); | 1736 | nr_rotated += hpage_nr_pages(page); |
1742 | /* | 1737 | /* |
@@ -2112,7 +2107,12 @@ restart: | |||
2112 | * with multiple processes reclaiming pages, the total | 2107 | * with multiple processes reclaiming pages, the total |
2113 | * freeing target can get unreasonably large. | 2108 | * freeing target can get unreasonably large. |
2114 | */ | 2109 | */ |
2115 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2110 | if (nr_reclaimed >= nr_to_reclaim) |
2111 | nr_to_reclaim = 0; | ||
2112 | else | ||
2113 | nr_to_reclaim -= nr_reclaimed; | ||
2114 | |||
2115 | if (!nr_to_reclaim && priority < DEF_PRIORITY) | ||
2116 | break; | 2116 | break; |
2117 | } | 2117 | } |
2118 | blk_finish_plug(&plug); | 2118 | blk_finish_plug(&plug); |
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2195 | * If compaction is deferred, reclaim up to a point where | 2195 | * If compaction is deferred, reclaim up to a point where |
2196 | * compaction will have a chance of success when re-enabled | 2196 | * compaction will have a chance of success when re-enabled |
2197 | */ | 2197 | */ |
2198 | if (compaction_deferred(zone)) | 2198 | if (compaction_deferred(zone, sc->order)) |
2199 | return watermark_ok; | 2199 | return watermark_ok; |
2200 | 2200 | ||
2201 | /* If compaction is not ready to start, keep reclaiming */ | 2201 | /* If compaction is not ready to start, keep reclaiming */ |
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2235 | unsigned long nr_soft_scanned; | 2235 | unsigned long nr_soft_scanned; |
2236 | bool aborted_reclaim = false; | 2236 | bool aborted_reclaim = false; |
2237 | 2237 | ||
2238 | /* | ||
2239 | * If the number of buffer_heads in the machine exceeds the maximum | ||
2240 | * allowed level, force direct reclaim to scan the highmem zone as | ||
2241 | * highmem pages could be pinning lowmem pages storing buffer_heads | ||
2242 | */ | ||
2243 | if (buffer_heads_over_limit) | ||
2244 | sc->gfp_mask |= __GFP_HIGHMEM; | ||
2245 | |||
2238 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2246 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2239 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2247 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
2240 | if (!populated_zone(zone)) | 2248 | if (!populated_zone(zone)) |
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2255 | * Even though compaction is invoked for any | 2263 | * Even though compaction is invoked for any |
2256 | * non-zero order, only frequent costly order | 2264 | * non-zero order, only frequent costly order |
2257 | * reclamation is disruptive enough to become a | 2265 | * reclamation is disruptive enough to become a |
2258 | * noticable problem, like transparent huge page | 2266 | * noticeable problem, like transparent huge |
2259 | * allocations. | 2267 | * page allocations. |
2260 | */ | 2268 | */ |
2261 | if (compaction_ready(zone, sc)) { | 2269 | if (compaction_ready(zone, sc)) { |
2262 | aborted_reclaim = true; | 2270 | aborted_reclaim = true; |
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2337 | unsigned long writeback_threshold; | 2345 | unsigned long writeback_threshold; |
2338 | bool aborted_reclaim; | 2346 | bool aborted_reclaim; |
2339 | 2347 | ||
2340 | get_mems_allowed(); | ||
2341 | delayacct_freepages_start(); | 2348 | delayacct_freepages_start(); |
2342 | 2349 | ||
2343 | if (global_reclaim(sc)) | 2350 | if (global_reclaim(sc)) |
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2401 | 2408 | ||
2402 | out: | 2409 | out: |
2403 | delayacct_freepages_end(); | 2410 | delayacct_freepages_end(); |
2404 | put_mems_allowed(); | ||
2405 | 2411 | ||
2406 | if (sc->nr_reclaimed) | 2412 | if (sc->nr_reclaimed) |
2407 | return sc->nr_reclaimed; | 2413 | return sc->nr_reclaimed; |
@@ -2724,6 +2730,17 @@ loop_again: | |||
2724 | */ | 2730 | */ |
2725 | age_active_anon(zone, &sc, priority); | 2731 | age_active_anon(zone, &sc, priority); |
2726 | 2732 | ||
2733 | /* | ||
2734 | * If the number of buffer_heads in the machine | ||
2735 | * exceeds the maximum allowed level and this node | ||
2736 | * has a highmem zone, force kswapd to reclaim from | ||
2737 | * it to relieve lowmem pressure. | ||
2738 | */ | ||
2739 | if (buffer_heads_over_limit && is_highmem_idx(i)) { | ||
2740 | end_zone = i; | ||
2741 | break; | ||
2742 | } | ||
2743 | |||
2727 | if (!zone_watermark_ok_safe(zone, order, | 2744 | if (!zone_watermark_ok_safe(zone, order, |
2728 | high_wmark_pages(zone), 0, 0)) { | 2745 | high_wmark_pages(zone), 0, 0)) { |
2729 | end_zone = i; | 2746 | end_zone = i; |
@@ -2753,7 +2770,7 @@ loop_again: | |||
2753 | */ | 2770 | */ |
2754 | for (i = 0; i <= end_zone; i++) { | 2771 | for (i = 0; i <= end_zone; i++) { |
2755 | struct zone *zone = pgdat->node_zones + i; | 2772 | struct zone *zone = pgdat->node_zones + i; |
2756 | int nr_slab; | 2773 | int nr_slab, testorder; |
2757 | unsigned long balance_gap; | 2774 | unsigned long balance_gap; |
2758 | 2775 | ||
2759 | if (!populated_zone(zone)) | 2776 | if (!populated_zone(zone)) |
@@ -2786,7 +2803,21 @@ loop_again: | |||
2786 | (zone->present_pages + | 2803 | (zone->present_pages + |
2787 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2804 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
2788 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2805 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2789 | if (!zone_watermark_ok_safe(zone, order, | 2806 | /* |
2807 | * Kswapd reclaims only single pages with compaction | ||
2808 | * enabled. Trying too hard to reclaim until contiguous | ||
2809 | * free pages have become available can hurt performance | ||
2810 | * by evicting too much useful data from memory. | ||
2811 | * Do not reclaim more than needed for compaction. | ||
2812 | */ | ||
2813 | testorder = order; | ||
2814 | if (COMPACTION_BUILD && order && | ||
2815 | compaction_suitable(zone, order) != | ||
2816 | COMPACT_SKIPPED) | ||
2817 | testorder = 0; | ||
2818 | |||
2819 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | |
2820 | !zone_watermark_ok_safe(zone, testorder, | |
2790 | high_wmark_pages(zone) + balance_gap, | 2821 | high_wmark_pages(zone) + balance_gap, |
2791 | end_zone, 0)) { | 2822 | end_zone, 0)) { |
2792 | shrink_zone(priority, zone, &sc); | 2823 | shrink_zone(priority, zone, &sc); |
@@ -2815,7 +2846,7 @@ loop_again: | |||
2815 | continue; | 2846 | continue; |
2816 | } | 2847 | } |
2817 | 2848 | ||
2818 | if (!zone_watermark_ok_safe(zone, order, | 2849 | if (!zone_watermark_ok_safe(zone, testorder, |
2819 | high_wmark_pages(zone), end_zone, 0)) { | 2850 | high_wmark_pages(zone), end_zone, 0)) { |
2820 | all_zones_ok = 0; | 2851 | all_zones_ok = 0; |
2821 | /* | 2852 | /* |
@@ -2903,6 +2934,8 @@ out: | |||
2903 | * and it is potentially going to sleep here. | 2934 | * and it is potentially going to sleep here. |
2904 | */ | 2935 | */ |
2905 | if (order) { | 2936 | if (order) { |
2937 | int zones_need_compaction = 1; | ||
2938 | |||
2906 | for (i = 0; i <= end_zone; i++) { | 2939 | for (i = 0; i <= end_zone; i++) { |
2907 | struct zone *zone = pgdat->node_zones + i; | 2940 | struct zone *zone = pgdat->node_zones + i; |
2908 | 2941 | ||
@@ -2912,6 +2945,10 @@ out: | |||
2912 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2945 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2913 | continue; | 2946 | continue; |
2914 | 2947 | ||
2948 | /* Would compaction fail due to lack of free memory? */ | ||
2949 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2950 | goto loop_again; | ||
2951 | |||
2915 | /* Confirm the zone is balanced for order-0 */ | 2952 | /* Confirm the zone is balanced for order-0 */ |
2916 | if (!zone_watermark_ok(zone, 0, | 2953 | if (!zone_watermark_ok(zone, 0, |
2917 | high_wmark_pages(zone), 0, 0)) { | 2954 | high_wmark_pages(zone), 0, 0)) { |
@@ -2919,11 +2956,17 @@ out: | |||
2919 | goto loop_again; | 2956 | goto loop_again; |
2920 | } | 2957 | } |
2921 | 2958 | ||
2959 | /* Check if the memory needs to be defragmented. */ | ||
2960 | if (zone_watermark_ok(zone, order, | ||
2961 | low_wmark_pages(zone), *classzone_idx, 0)) | ||
2962 | zones_need_compaction = 0; | ||
2963 | |||
2922 | /* If balanced, clear the congested flag */ | 2964 | /* If balanced, clear the congested flag */ |
2923 | zone_clear_flag(zone, ZONE_CONGESTED); | 2965 | zone_clear_flag(zone, ZONE_CONGESTED); |
2924 | if (i <= *classzone_idx) | ||
2925 | balanced += zone->present_pages; | ||
2926 | } | 2966 | } |
2967 | |||
2968 | if (zones_need_compaction) | ||
2969 | compact_pgdat(pgdat, order); | ||
2927 | } | 2970 | } |
2928 | 2971 | ||
2929 | /* | 2972 | /* |