aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-03-22 12:04:48 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-03-22 12:04:48 -0400
commit 95211279c5ad00a317c98221d7e4365e02f20836 (patch)
tree 2ddc8625378d2915b8c96392f3cf6663b705ed55
parent 5375871d432ae9fc581014ac117b96aaee3cd0c7 (diff)
parent 12724850e8064f64b6223d26d78c0597c742c65a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge first batch of patches from Andrew Morton:
 "A few misc things and all the MM queue"

* emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits)
  memcg: avoid THP split in task migration
  thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE
  memcg: clean up existing move charge code
  mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read()
  mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event()
  mm/memcontrol.c: s/stealed/stolen/
  memcg: fix performance of mem_cgroup_begin_update_page_stat()
  memcg: remove PCG_FILE_MAPPED
  memcg: use new logic for page stat accounting
  memcg: remove PCG_MOVE_LOCK flag from page_cgroup
  memcg: simplify move_account() check
  memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat)
  memcg: kill dead prev_priority stubs
  memcg: remove PCG_CACHE page_cgroup flag
  memcg: let css_get_next() rely upon rcu_read_lock()
  cgroup: revert ss_id_lock to spinlock
  idr: make idr_get_next() good for rcu_read_lock()
  memcg: remove unnecessary thp check in page stat accounting
  memcg: remove redundant returns
  memcg: enum lru_list lru
  ...
-rw-r--r--Documentation/filesystems/proc.txt32
-rw-r--r--Documentation/kernel-parameters.txt7
-rw-r--r--Documentation/vm/page-types.c2
-rw-r--r--Documentation/vm/pagemap.txt4
-rw-r--r--arch/sparc/kernel/signal32.c7
-rw-r--r--arch/sparc/kernel/signal_32.c7
-rw-r--r--arch/sparc/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/sys_x86_64.c34
-rw-r--r--arch/x86/kernel/vm86_32.c2
-rw-r--r--arch/x86/mm/hugetlbpage.c28
-rw-r--r--arch/x86/mm/numa_emulation.c2
-rw-r--r--arch/xtensa/kernel/signal.c35
-rw-r--r--drivers/idle/intel_idle.c8
-rw-r--r--drivers/tty/sysrq.c2
-rw-r--r--fs/exec.c2
-rw-r--r--fs/hugetlbfs/inode.c138
-rw-r--r--fs/namei.c6
-rw-r--r--fs/proc/base.c12
-rw-r--r--fs/proc/internal.h9
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c357
-rw-r--r--fs/proc/task_nommu.c69
-rw-r--r--fs/seq_file.c28
-rw-r--r--include/asm-generic/pgtable.h61
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--include/linux/compaction.h20
-rw-r--r--include/linux/cpuset.h47
-rw-r--r--include/linux/huge_mm.h28
-rw-r--r--include/linux/hugetlb.h45
-rw-r--r--include/linux/init_task.h8
-rw-r--r--include/linux/kernel-page-flags.h1
-rw-r--r--include/linux/memcontrol.h58
-rw-r--r--include/linux/migrate.h2
-rw-r--r--include/linux/mm.h30
-rw-r--r--include/linux/mmzone.h1
-rw-r--r--include/linux/oom.h2
-rw-r--r--include/linux/page-flags.h20
-rw-r--r--include/linux/page_cgroup.h33
-rw-r--r--include/linux/rmap.h1
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/swap.h2
-rw-r--r--ipc/shm.c2
-rw-r--r--kernel/cgroup.c19
-rw-r--r--kernel/cpuset.c43
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c22
-rw-r--r--lib/idr.c8
-rw-r--r--mm/bootmem.c5
-rw-r--r--mm/compaction.c77
-rw-r--r--mm/filemap.c20
-rw-r--r--mm/huge_memory.c125
-rw-r--r--mm/hugetlb.c184
-rw-r--r--mm/ksm.c34
-rw-r--r--mm/memcontrol.c473
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c53
-rw-r--r--mm/mempolicy.c62
-rw-r--r--mm/migrate.c36
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mmap.c51
-rw-r--r--mm/mmu_context.c2
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/oom_kill.c166
-rw-r--r--mm/page-writeback.c1
-rw-r--r--mm/page_alloc.c58
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c70
-rw-r--r--mm/shmem.c88
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slub.c40
-rw-r--r--mm/sparse.c30
-rw-r--r--mm/swap.c4
-rw-r--r--mm/swap_state.c24
-rw-r--r--mm/swapfile.c58
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmscan.c151
77 files changed, 1902 insertions, 1235 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a76a26a1db8a..b7413cb46dcb 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -290,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
290 rsslim current limit in bytes on the rss 290 rsslim current limit in bytes on the rss
291 start_code address above which program text can run 291 start_code address above which program text can run
292 end_code address below which program text can run 292 end_code address below which program text can run
293 start_stack address of the start of the stack 293 start_stack address of the start of the main process stack
294 esp current value of ESP 294 esp current value of ESP
295 eip current value of EIP 295 eip current value of EIP
296 pending bitmap of pending signals 296 pending bitmap of pending signals
@@ -325,7 +325,7 @@ address perms offset dev inode pathname
325a7cb1000-a7cb2000 ---p 00000000 00:00 0 325a7cb1000-a7cb2000 ---p 00000000 00:00 0
326a7cb2000-a7eb2000 rw-p 00000000 00:00 0 326a7cb2000-a7eb2000 rw-p 00000000 00:00 0
327a7eb2000-a7eb3000 ---p 00000000 00:00 0 327a7eb2000-a7eb3000 ---p 00000000 00:00 0
328a7eb3000-a7ed5000 rw-p 00000000 00:00 0 328a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
329a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 329a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
330a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 330a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
331a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 331a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
@@ -357,11 +357,39 @@ is not associated with a file:
357 357
358 [heap] = the heap of the program 358 [heap] = the heap of the program
359 [stack] = the stack of the main process 359 [stack] = the stack of the main process
360 [stack:1001] = the stack of the thread with tid 1001
360 [vdso] = the "virtual dynamic shared object", 361 [vdso] = the "virtual dynamic shared object",
361 the kernel system call handler 362 the kernel system call handler
362 363
363 or if empty, the mapping is anonymous. 364 or if empty, the mapping is anonymous.
364 365
366The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
367of the individual tasks of a process. In this file you will see a mapping marked
368as [stack] if that task sees it as a stack. This is a key difference from the
369content of /proc/PID/maps, where you will see all mappings that are being used
370as stack by all of those tasks. Hence, for the example above, the task-level
371map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
372
37308048000-08049000 r-xp 00000000 03:00 8312 /opt/test
37408049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3750804a000-0806b000 rw-p 00000000 00:00 0 [heap]
376a7cb1000-a7cb2000 ---p 00000000 00:00 0
377a7cb2000-a7eb2000 rw-p 00000000 00:00 0
378a7eb2000-a7eb3000 ---p 00000000 00:00 0
379a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack]
380a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
381a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
382a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
383a800b000-a800e000 rw-p 00000000 00:00 0
384a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
385a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
386a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
387a8024000-a8027000 rw-p 00000000 00:00 0
388a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
389a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
390a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
391aff35000-aff4a000 rw-p 00000000 00:00 0
392ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
365 393
366The /proc/PID/smaps is an extension based on maps, showing the memory 394The /proc/PID/smaps is an extension based on maps, showing the memory
367consumption for each of the process's mappings. For each of mappings there 395consumption for each of the process's mappings. For each of mappings there
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8cadb7551fca..7986d79d9d17 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2635,6 +2635,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2635 to facilitate early boot debugging. 2635 to facilitate early boot debugging.
2636 See also Documentation/trace/events.txt 2636 See also Documentation/trace/events.txt
2637 2637
2638 transparent_hugepage=
2639 [KNL]
2640 Format: [always|madvise|never]
2641 Can be used to control the default behavior of the system
2642 with respect to transparent hugepages.
2643 See Documentation/vm/transhuge.txt for more details.
2644
2638 tsc= Disable clocksource stability checks for TSC. 2645 tsc= Disable clocksource stability checks for TSC.
2639 Format: <string> 2646 Format: <string>
2640 [x86] reliable: mark tsc clocksource as reliable, this 2647 [x86] reliable: mark tsc clocksource as reliable, this
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 7445caa26d05..0b13f02d4059 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -98,6 +98,7 @@
98#define KPF_HWPOISON 19 98#define KPF_HWPOISON 19
99#define KPF_NOPAGE 20 99#define KPF_NOPAGE 20
100#define KPF_KSM 21 100#define KPF_KSM 21
101#define KPF_THP 22
101 102
102/* [32-] kernel hacking assistances */ 103/* [32-] kernel hacking assistances */
103#define KPF_RESERVED 32 104#define KPF_RESERVED 32
@@ -147,6 +148,7 @@ static const char *page_flag_names[] = {
147 [KPF_HWPOISON] = "X:hwpoison", 148 [KPF_HWPOISON] = "X:hwpoison",
148 [KPF_NOPAGE] = "n:nopage", 149 [KPF_NOPAGE] = "n:nopage",
149 [KPF_KSM] = "x:ksm", 150 [KPF_KSM] = "x:ksm",
151 [KPF_THP] = "t:thp",
150 152
151 [KPF_RESERVED] = "r:reserved", 153 [KPF_RESERVED] = "r:reserved",
152 [KPF_MLOCKED] = "m:mlocked", 154 [KPF_MLOCKED] = "m:mlocked",
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index df09b9650a81..4600cbe3d6be 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -60,6 +60,7 @@ There are three components to pagemap:
60 19. HWPOISON 60 19. HWPOISON
61 20. NOPAGE 61 20. NOPAGE
62 21. KSM 62 21. KSM
63 22. THP
63 64
64Short descriptions to the page flags: 65Short descriptions to the page flags:
65 66
@@ -97,6 +98,9 @@ Short descriptions to the page flags:
9721. KSM 9821. KSM
98 identical memory pages dynamically shared between one or more processes 99 identical memory pages dynamically shared between one or more processes
99 100
10122. THP
102 contiguous pages which construct transparent hugepages
103
100 [IO related page flags] 104 [IO related page flags]
101 1. ERROR IO error occurred 105 1. ERROR IO error occurred
102 3. UPTODATE page has up-to-date data 106 3. UPTODATE page has up-to-date data
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 023b8860dc97..c8f5b50db89c 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -776,7 +776,6 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
776 siginfo_t *info, 776 siginfo_t *info,
777 sigset_t *oldset, struct pt_regs *regs) 777 sigset_t *oldset, struct pt_regs *regs)
778{ 778{
779 sigset_t blocked;
780 int err; 779 int err;
781 780
782 if (ka->sa.sa_flags & SA_SIGINFO) 781 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -787,11 +786,7 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
787 if (err) 786 if (err)
788 return err; 787 return err;
789 788
790 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 789 block_sigmask(ka, signr);
791 if (!(ka->sa.sa_flags & SA_NOMASK))
792 sigaddset(&blocked, signr);
793 set_current_blocked(&blocked);
794
795 tracehook_signal_handler(signr, info, ka, regs, 0); 790 tracehook_signal_handler(signr, info, ka, regs, 0);
796 791
797 return 0; 792 return 0;
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index d54c6e53aba0..7bb71b6fbd20 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -465,7 +465,6 @@ static inline int
465handle_signal(unsigned long signr, struct k_sigaction *ka, 465handle_signal(unsigned long signr, struct k_sigaction *ka,
466 siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) 466 siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
467{ 467{
468 sigset_t blocked;
469 int err; 468 int err;
470 469
471 if (ka->sa.sa_flags & SA_SIGINFO) 470 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -476,11 +475,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka,
476 if (err) 475 if (err)
477 return err; 476 return err;
478 477
479 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 478 block_sigmask(ka, signr);
480 if (!(ka->sa.sa_flags & SA_NOMASK))
481 sigaddset(&blocked, signr);
482 set_current_blocked(&blocked);
483
484 tracehook_signal_handler(signr, info, ka, regs, 0); 479 tracehook_signal_handler(signr, info, ka, regs, 0);
485 480
486 return 0; 481 return 0;
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index f0836cd0e2f2..d8a67e60be80 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -479,18 +479,14 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka,
479 siginfo_t *info, 479 siginfo_t *info,
480 sigset_t *oldset, struct pt_regs *regs) 480 sigset_t *oldset, struct pt_regs *regs)
481{ 481{
482 sigset_t blocked;
483 int err; 482 int err;
484 483
485 err = setup_rt_frame(ka, regs, signr, oldset, 484 err = setup_rt_frame(ka, regs, signr, oldset,
486 (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL); 485 (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL);
487 if (err) 486 if (err)
488 return err; 487 return err;
489 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
490 if (!(ka->sa.sa_flags & SA_NOMASK))
491 sigaddset(&blocked, signr);
492 set_current_blocked(&blocked);
493 488
489 block_sigmask(ka, signr);
494 tracehook_signal_handler(signr, info, ka, regs, 0); 490 tracehook_signal_handler(signr, info, ka, regs, 0);
495 491
496 return 0; 492 return 0;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 051489082d59..ef59642ff1bf 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195{ 195{
196 struct vm_area_struct *vma; 196 struct vm_area_struct *vma;
197 struct mm_struct *mm = current->mm; 197 struct mm_struct *mm = current->mm;
198 unsigned long addr = addr0; 198 unsigned long addr = addr0, start_addr;
199 199
200 /* requested length too big for entire address space */ 200 /* requested length too big for entire address space */
201 if (len > TASK_SIZE) 201 if (len > TASK_SIZE)
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
223 mm->free_area_cache = mm->mmap_base; 223 mm->free_area_cache = mm->mmap_base;
224 } 224 }
225 225
226try_again:
226 /* either no address requested or can't fit in requested address hole */ 227 /* either no address requested or can't fit in requested address hole */
227 addr = mm->free_area_cache; 228 start_addr = addr = mm->free_area_cache;
228
229 /* make sure it can fit in the remaining address space */
230 if (addr > len) {
231 unsigned long tmp_addr = align_addr(addr - len, filp,
232 ALIGN_TOPDOWN);
233
234 vma = find_vma(mm, tmp_addr);
235 if (!vma || tmp_addr + len <= vma->vm_start)
236 /* remember the address as a hint for next time */
237 return mm->free_area_cache = tmp_addr;
238 }
239
240 if (mm->mmap_base < len)
241 goto bottomup;
242 229
243 addr = mm->mmap_base-len; 230 if (addr < len)
231 goto fail;
244 232
233 addr -= len;
245 do { 234 do {
246 addr = align_addr(addr, filp, ALIGN_TOPDOWN); 235 addr = align_addr(addr, filp, ALIGN_TOPDOWN);
247 236
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
263 addr = vma->vm_start-len; 252 addr = vma->vm_start-len;
264 } while (len < vma->vm_start); 253 } while (len < vma->vm_start);
265 254
255fail:
256 /*
257 * if hint left us with no space for the requested
258 * mapping then try again:
259 */
260 if (start_addr != mm->mmap_base) {
261 mm->free_area_cache = mm->mmap_base;
262 mm->cached_hole_size = 0;
263 goto try_again;
264 }
265
266bottomup: 266bottomup:
267 /* 267 /*
268 * A failed mmap() very likely causes application failure, 268 * A failed mmap() very likely causes application failure,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..328cb37bb827 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
172 spinlock_t *ptl; 172 spinlock_t *ptl;
173 int i; 173 int i;
174 174
175 down_write(&mm->mmap_sem);
175 pgd = pgd_offset(mm, 0xA0000); 176 pgd = pgd_offset(mm, 0xA0000);
176 if (pgd_none_or_clear_bad(pgd)) 177 if (pgd_none_or_clear_bad(pgd))
177 goto out; 178 goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
190 } 191 }
191 pte_unmap_unlock(pte, ptl); 192 pte_unmap_unlock(pte, ptl);
192out: 193out:
194 up_write(&mm->mmap_sem);
193 flush_tlb(); 195 flush_tlb();
194} 196}
195 197
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8ecbb4bba4b3..f6679a7fb8ca 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -308,10 +308,11 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
308{ 308{
309 struct hstate *h = hstate_file(file); 309 struct hstate *h = hstate_file(file);
310 struct mm_struct *mm = current->mm; 310 struct mm_struct *mm = current->mm;
311 struct vm_area_struct *vma, *prev_vma; 311 struct vm_area_struct *vma;
312 unsigned long base = mm->mmap_base, addr = addr0; 312 unsigned long base = mm->mmap_base;
313 unsigned long addr = addr0;
313 unsigned long largest_hole = mm->cached_hole_size; 314 unsigned long largest_hole = mm->cached_hole_size;
314 int first_time = 1; 315 unsigned long start_addr;
315 316
316 /* don't allow allocations above current base */ 317 /* don't allow allocations above current base */
317 if (mm->free_area_cache > base) 318 if (mm->free_area_cache > base)
@@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
322 mm->free_area_cache = base; 323 mm->free_area_cache = base;
323 } 324 }
324try_again: 325try_again:
326 start_addr = mm->free_area_cache;
327
325 /* make sure it can fit in the remaining address space */ 328 /* make sure it can fit in the remaining address space */
326 if (mm->free_area_cache < len) 329 if (mm->free_area_cache < len)
327 goto fail; 330 goto fail;
@@ -337,22 +340,14 @@ try_again:
337 if (!vma) 340 if (!vma)
338 return addr; 341 return addr;
339 342
340 /* 343 if (addr + len <= vma->vm_start) {
341 * new region fits between prev_vma->vm_end and
342 * vma->vm_start, use it:
343 */
344 prev_vma = vma->vm_prev;
345 if (addr + len <= vma->vm_start &&
346 (!prev_vma || (addr >= prev_vma->vm_end))) {
347 /* remember the address as a hint for next time */ 344 /* remember the address as a hint for next time */
348 mm->cached_hole_size = largest_hole; 345 mm->cached_hole_size = largest_hole;
349 return (mm->free_area_cache = addr); 346 return (mm->free_area_cache = addr);
350 } else { 347 } else if (mm->free_area_cache == vma->vm_end) {
351 /* pull free_area_cache down to the first hole */ 348 /* pull free_area_cache down to the first hole */
352 if (mm->free_area_cache == vma->vm_end) { 349 mm->free_area_cache = vma->vm_start;
353 mm->free_area_cache = vma->vm_start; 350 mm->cached_hole_size = largest_hole;
354 mm->cached_hole_size = largest_hole;
355 }
356 } 351 }
357 352
358 /* remember the largest hole we saw so far */ 353 /* remember the largest hole we saw so far */
@@ -368,10 +363,9 @@ fail:
368 * if hint left us with no space for the requested 363 * if hint left us with no space for the requested
369 * mapping then try again: 364 * mapping then try again:
370 */ 365 */
371 if (first_time) { 366 if (start_addr != base) {
372 mm->free_area_cache = base; 367 mm->free_area_cache = base;
373 largest_hole = 0; 368 largest_hole = 0;
374 first_time = 0;
375 goto try_again; 369 goto try_again;
376 } 370 }
377 /* 371 /*
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 46db56845f18..740b0a355431 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
60 eb->nid = nid; 60 eb->nid = nid;
61 61
62 if (emu_nid_to_phys[nid] == NUMA_NO_NODE) 62 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
63 emu_nid_to_phys[nid] = pb->nid; 63 emu_nid_to_phys[nid] = nid;
64 64
65 pb->start += size; 65 pb->start += size;
66 if (pb->start >= pb->end) { 66 if (pb->start >= pb->end) {
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index f2220b5bdce6..b69b000349fc 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -260,10 +260,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
260 goto badframe; 260 goto badframe;
261 261
262 sigdelsetmask(&set, ~_BLOCKABLE); 262 sigdelsetmask(&set, ~_BLOCKABLE);
263 spin_lock_irq(&current->sighand->siglock); 263 set_current_blocked(&set);
264 current->blocked = set;
265 recalc_sigpending();
266 spin_unlock_irq(&current->sighand->siglock);
267 264
268 if (restore_sigcontext(regs, frame)) 265 if (restore_sigcontext(regs, frame))
269 goto badframe; 266 goto badframe;
@@ -336,8 +333,8 @@ gen_return_code(unsigned char *codemem)
336} 333}
337 334
338 335
339static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 336static int setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
340 sigset_t *set, struct pt_regs *regs) 337 sigset_t *set, struct pt_regs *regs)
341{ 338{
342 struct rt_sigframe *frame; 339 struct rt_sigframe *frame;
343 int err = 0; 340 int err = 0;
@@ -422,12 +419,11 @@ static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
422 current->comm, current->pid, signal, frame, regs->pc); 419 current->comm, current->pid, signal, frame, regs->pc);
423#endif 420#endif
424 421
425 return; 422 return 0;
426 423
427give_sigsegv: 424give_sigsegv:
428 if (sig == SIGSEGV) 425 force_sigsegv(sig, current);
429 ka->sa.sa_handler = SIG_DFL; 426 return -EFAULT;
430 force_sig(SIGSEGV, current);
431} 427}
432 428
433/* 429/*
@@ -449,11 +445,8 @@ asmlinkage long xtensa_rt_sigsuspend(sigset_t __user *unewset,
449 return -EFAULT; 445 return -EFAULT;
450 446
451 sigdelsetmask(&newset, ~_BLOCKABLE); 447 sigdelsetmask(&newset, ~_BLOCKABLE);
452 spin_lock_irq(&current->sighand->siglock);
453 saveset = current->blocked; 448 saveset = current->blocked;
454 current->blocked = newset; 449 set_current_blocked(&newset);
455 recalc_sigpending();
456 spin_unlock_irq(&current->sighand->siglock);
457 450
458 regs->areg[2] = -EINTR; 451 regs->areg[2] = -EINTR;
459 while (1) { 452 while (1) {
@@ -536,17 +529,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
536 529
537 /* Whee! Actually deliver the signal. */ 530 /* Whee! Actually deliver the signal. */
538 /* Set up the stack frame */ 531 /* Set up the stack frame */
539 setup_frame(signr, &ka, &info, oldset, regs); 532 ret = setup_frame(signr, &ka, &info, oldset, regs);
540 533 if (ret)
541 if (ka.sa.sa_flags & SA_ONESHOT) 534 return ret;
542 ka.sa.sa_handler = SIG_DFL;
543 535
544 spin_lock_irq(&current->sighand->siglock); 536 block_sigmask(&ka, signr);
545 sigorsets(&current->blocked, &current->blocked, &ka.sa.sa_mask);
546 if (!(ka.sa.sa_flags & SA_NODEFER))
547 sigaddset(&current->blocked, signr);
548 recalc_sigpending();
549 spin_unlock_irq(&current->sighand->siglock);
550 if (current->ptrace & PT_SINGLESTEP) 537 if (current->ptrace & PT_SINGLESTEP)
551 task_pt_regs(current)->icountlevel = 1; 538 task_pt_regs(current)->icountlevel = 1;
552 539
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 1c15e9b33575..d0f59c3f87ef 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -507,8 +507,7 @@ int intel_idle_cpu_init(int cpu)
507 int num_substates; 507 int num_substates;
508 508
509 if (cstate > max_cstate) { 509 if (cstate > max_cstate) {
510 printk(PREFIX "max_cstate %d reached\n", 510 printk(PREFIX "max_cstate %d reached\n", max_cstate);
511 max_cstate);
512 break; 511 break;
513 } 512 }
514 513
@@ -524,8 +523,9 @@ int intel_idle_cpu_init(int cpu)
524 dev->states_usage[dev->state_count].driver_data = 523 dev->states_usage[dev->state_count].driver_data =
525 (void *)get_driver_data(cstate); 524 (void *)get_driver_data(cstate);
526 525
527 dev->state_count += 1; 526 dev->state_count += 1;
528 } 527 }
528
529 dev->cpu = cpu; 529 dev->cpu = cpu;
530 530
531 if (cpuidle_register_device(dev)) { 531 if (cpuidle_register_device(dev)) {
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index ecb8e2203ac8..136e86faa1e1 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -346,7 +346,7 @@ static struct sysrq_key_op sysrq_term_op = {
346 346
347static void moom_callback(struct work_struct *ignored) 347static void moom_callback(struct work_struct *ignored)
348{ 348{
349 out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL); 349 out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL, true);
350} 350}
351 351
352static DECLARE_WORK(moom_work, moom_callback); 352static DECLARE_WORK(moom_work, moom_callback);
diff --git a/fs/exec.c b/fs/exec.c
index 0b931471d4f4..23559c227d9c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -822,7 +822,7 @@ static int exec_mmap(struct mm_struct *mm)
822 /* Notify parent that we're no longer interested in the old VM */ 822 /* Notify parent that we're no longer interested in the old VM */
823 tsk = current; 823 tsk = current;
824 old_mm = current->mm; 824 old_mm = current->mm;
825 sync_mm_rss(tsk, old_mm); 825 sync_mm_rss(old_mm);
826 mm_release(tsk, old_mm); 826 mm_release(tsk, old_mm);
827 827
828 if (old_mm) { 828 if (old_mm) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 81932fa1861a..ea251749d9d5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations;
41static const struct inode_operations hugetlbfs_dir_inode_operations; 41static const struct inode_operations hugetlbfs_dir_inode_operations;
42static const struct inode_operations hugetlbfs_inode_operations; 42static const struct inode_operations hugetlbfs_inode_operations;
43 43
44struct hugetlbfs_config {
45 uid_t uid;
46 gid_t gid;
47 umode_t mode;
48 long nr_blocks;
49 long nr_inodes;
50 struct hstate *hstate;
51};
52
53struct hugetlbfs_inode_info {
54 struct shared_policy policy;
55 struct inode vfs_inode;
56};
57
58static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
59{
60 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
61}
62
44static struct backing_dev_info hugetlbfs_backing_dev_info = { 63static struct backing_dev_info hugetlbfs_backing_dev_info = {
45 .name = "hugetlbfs", 64 .name = "hugetlbfs",
46 .ra_pages = 0, /* No readahead */ 65 .ra_pages = 0, /* No readahead */
@@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
154 return addr; 173 return addr;
155 } 174 }
156 175
157 start_addr = mm->free_area_cache; 176 if (len > mm->cached_hole_size)
158 177 start_addr = mm->free_area_cache;
159 if (len <= mm->cached_hole_size) 178 else {
160 start_addr = TASK_UNMAPPED_BASE; 179 start_addr = TASK_UNMAPPED_BASE;
180 mm->cached_hole_size = 0;
181 }
161 182
162full_search: 183full_search:
163 addr = ALIGN(start_addr, huge_page_size(h)); 184 addr = ALIGN(start_addr, huge_page_size(h));
@@ -171,13 +192,18 @@ full_search:
171 */ 192 */
172 if (start_addr != TASK_UNMAPPED_BASE) { 193 if (start_addr != TASK_UNMAPPED_BASE) {
173 start_addr = TASK_UNMAPPED_BASE; 194 start_addr = TASK_UNMAPPED_BASE;
195 mm->cached_hole_size = 0;
174 goto full_search; 196 goto full_search;
175 } 197 }
176 return -ENOMEM; 198 return -ENOMEM;
177 } 199 }
178 200
179 if (!vma || addr + len <= vma->vm_start) 201 if (!vma || addr + len <= vma->vm_start) {
202 mm->free_area_cache = addr + len;
180 return addr; 203 return addr;
204 }
205 if (addr + mm->cached_hole_size < vma->vm_start)
206 mm->cached_hole_size = vma->vm_start - addr;
181 addr = ALIGN(vma->vm_end, huge_page_size(h)); 207 addr = ALIGN(vma->vm_end, huge_page_size(h));
182 } 208 }
183} 209}
@@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
238 loff_t isize; 264 loff_t isize;
239 ssize_t retval = 0; 265 ssize_t retval = 0;
240 266
241 mutex_lock(&inode->i_mutex);
242
243 /* validate length */ 267 /* validate length */
244 if (len == 0) 268 if (len == 0)
245 goto out; 269 goto out;
246 270
247 isize = i_size_read(inode);
248 if (!isize)
249 goto out;
250
251 end_index = (isize - 1) >> huge_page_shift(h);
252 for (;;) { 271 for (;;) {
253 struct page *page; 272 struct page *page;
254 unsigned long nr, ret; 273 unsigned long nr, ret;
@@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
256 275
257 /* nr is the maximum number of bytes to copy from this page */ 276 /* nr is the maximum number of bytes to copy from this page */
258 nr = huge_page_size(h); 277 nr = huge_page_size(h);
278 isize = i_size_read(inode);
279 if (!isize)
280 goto out;
281 end_index = (isize - 1) >> huge_page_shift(h);
259 if (index >= end_index) { 282 if (index >= end_index) {
260 if (index > end_index) 283 if (index > end_index)
261 goto out; 284 goto out;
262 nr = ((isize - 1) & ~huge_page_mask(h)) + 1; 285 nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
263 if (nr <= offset) { 286 if (nr <= offset)
264 goto out; 287 goto out;
265 }
266 } 288 }
267 nr = nr - offset; 289 nr = nr - offset;
268 290
269 /* Find the page */ 291 /* Find the page */
270 page = find_get_page(mapping, index); 292 page = find_lock_page(mapping, index);
271 if (unlikely(page == NULL)) { 293 if (unlikely(page == NULL)) {
272 /* 294 /*
273 * We have a HOLE, zero out the user-buffer for the 295 * We have a HOLE, zero out the user-buffer for the
@@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
279 else 301 else
280 ra = 0; 302 ra = 0;
281 } else { 303 } else {
304 unlock_page(page);
305
282 /* 306 /*
283 * We have the page, copy it to user space buffer. 307 * We have the page, copy it to user space buffer.
284 */ 308 */
285 ra = hugetlbfs_read_actor(page, offset, buf, len, nr); 309 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
286 ret = ra; 310 ret = ra;
311 page_cache_release(page);
287 } 312 }
288 if (ra < 0) { 313 if (ra < 0) {
289 if (retval == 0) 314 if (retval == 0)
290 retval = ra; 315 retval = ra;
291 if (page)
292 page_cache_release(page);
293 goto out; 316 goto out;
294 } 317 }
295 318
@@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
299 index += offset >> huge_page_shift(h); 322 index += offset >> huge_page_shift(h);
300 offset &= ~huge_page_mask(h); 323 offset &= ~huge_page_mask(h);
301 324
302 if (page)
303 page_cache_release(page);
304
305 /* short read or no more work */ 325 /* short read or no more work */
306 if ((ret != nr) || (len == 0)) 326 if ((ret != nr) || (len == 0))
307 break; 327 break;
308 } 328 }
309out: 329out:
310 *ppos = ((loff_t)index << huge_page_shift(h)) + offset; 330 *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
311 mutex_unlock(&inode->i_mutex);
312 return retval; 331 return retval;
313} 332}
314 333
@@ -607,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
607 spin_lock(&sbinfo->stat_lock); 626 spin_lock(&sbinfo->stat_lock);
608 /* If no limits set, just report 0 for max/free/used 627 /* If no limits set, just report 0 for max/free/used
609 * blocks, like simple_statfs() */ 628 * blocks, like simple_statfs() */
610 if (sbinfo->max_blocks >= 0) { 629 if (sbinfo->spool) {
611 buf->f_blocks = sbinfo->max_blocks; 630 long free_pages;
612 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 631
632 spin_lock(&sbinfo->spool->lock);
633 buf->f_blocks = sbinfo->spool->max_hpages;
634 free_pages = sbinfo->spool->max_hpages
635 - sbinfo->spool->used_hpages;
636 buf->f_bavail = buf->f_bfree = free_pages;
637 spin_unlock(&sbinfo->spool->lock);
613 buf->f_files = sbinfo->max_inodes; 638 buf->f_files = sbinfo->max_inodes;
614 buf->f_ffree = sbinfo->free_inodes; 639 buf->f_ffree = sbinfo->free_inodes;
615 } 640 }
@@ -625,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
625 650
626 if (sbi) { 651 if (sbi) {
627 sb->s_fs_info = NULL; 652 sb->s_fs_info = NULL;
653
654 if (sbi->spool)
655 hugepage_put_subpool(sbi->spool);
656
628 kfree(sbi); 657 kfree(sbi);
629 } 658 }
630} 659}
@@ -853,10 +882,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
853 sb->s_fs_info = sbinfo; 882 sb->s_fs_info = sbinfo;
854 sbinfo->hstate = config.hstate; 883 sbinfo->hstate = config.hstate;
855 spin_lock_init(&sbinfo->stat_lock); 884 spin_lock_init(&sbinfo->stat_lock);
856 sbinfo->max_blocks = config.nr_blocks;
857 sbinfo->free_blocks = config.nr_blocks;
858 sbinfo->max_inodes = config.nr_inodes; 885 sbinfo->max_inodes = config.nr_inodes;
859 sbinfo->free_inodes = config.nr_inodes; 886 sbinfo->free_inodes = config.nr_inodes;
887 sbinfo->spool = NULL;
888 if (config.nr_blocks != -1) {
889 sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
890 if (!sbinfo->spool)
891 goto out_free;
892 }
860 sb->s_maxbytes = MAX_LFS_FILESIZE; 893 sb->s_maxbytes = MAX_LFS_FILESIZE;
861 sb->s_blocksize = huge_page_size(config.hstate); 894 sb->s_blocksize = huge_page_size(config.hstate);
862 sb->s_blocksize_bits = huge_page_shift(config.hstate); 895 sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -868,38 +901,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
868 goto out_free; 901 goto out_free;
869 return 0; 902 return 0;
870out_free: 903out_free:
904 if (sbinfo->spool)
905 kfree(sbinfo->spool);
871 kfree(sbinfo); 906 kfree(sbinfo);
872 return -ENOMEM; 907 return -ENOMEM;
873} 908}
874 909
875int hugetlb_get_quota(struct address_space *mapping, long delta)
876{
877 int ret = 0;
878 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
879
880 if (sbinfo->free_blocks > -1) {
881 spin_lock(&sbinfo->stat_lock);
882 if (sbinfo->free_blocks - delta >= 0)
883 sbinfo->free_blocks -= delta;
884 else
885 ret = -ENOMEM;
886 spin_unlock(&sbinfo->stat_lock);
887 }
888
889 return ret;
890}
891
892void hugetlb_put_quota(struct address_space *mapping, long delta)
893{
894 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
895
896 if (sbinfo->free_blocks > -1) {
897 spin_lock(&sbinfo->stat_lock);
898 sbinfo->free_blocks += delta;
899 spin_unlock(&sbinfo->stat_lock);
900 }
901}
902
903static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, 910static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
904 int flags, const char *dev_name, void *data) 911 int flags, const char *dev_name, void *data)
905{ 912{
@@ -919,8 +926,8 @@ static int can_do_hugetlb_shm(void)
919 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 926 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
920} 927}
921 928
922struct file *hugetlb_file_setup(const char *name, size_t size, 929struct file *hugetlb_file_setup(const char *name, unsigned long addr,
923 vm_flags_t acctflag, 930 size_t size, vm_flags_t acctflag,
924 struct user_struct **user, int creat_flags) 931 struct user_struct **user, int creat_flags)
925{ 932{
926 int error = -ENOMEM; 933 int error = -ENOMEM;
@@ -929,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
929 struct path path; 936 struct path path;
930 struct dentry *root; 937 struct dentry *root;
931 struct qstr quick_string; 938 struct qstr quick_string;
939 struct hstate *hstate;
940 unsigned long num_pages;
932 941
933 *user = NULL; 942 *user = NULL;
934 if (!hugetlbfs_vfsmount) 943 if (!hugetlbfs_vfsmount)
@@ -937,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
937 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 946 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
938 *user = current_user(); 947 *user = current_user();
939 if (user_shm_lock(size, *user)) { 948 if (user_shm_lock(size, *user)) {
940 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); 949 task_lock(current);
950 printk_once(KERN_WARNING
951 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
952 current->comm, current->pid);
953 task_unlock(current);
941 } else { 954 } else {
942 *user = NULL; 955 *user = NULL;
943 return ERR_PTR(-EPERM); 956 return ERR_PTR(-EPERM);
@@ -958,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
958 if (!inode) 971 if (!inode)
959 goto out_dentry; 972 goto out_dentry;
960 973
974 hstate = hstate_inode(inode);
975 size += addr & ~huge_page_mask(hstate);
976 num_pages = ALIGN(size, huge_page_size(hstate)) >>
977 huge_page_shift(hstate);
961 error = -ENOMEM; 978 error = -ENOMEM;
962 if (hugetlb_reserve_pages(inode, 0, 979 if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
963 size >> huge_page_shift(hstate_inode(inode)), NULL,
964 acctflag))
965 goto out_inode; 980 goto out_inode;
966 981
967 d_instantiate(path.dentry, inode); 982 d_instantiate(path.dentry, inode);
@@ -997,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void)
997 if (error) 1012 if (error)
998 return error; 1013 return error;
999 1014
1015 error = -ENOMEM;
1000 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1016 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1001 sizeof(struct hugetlbfs_inode_info), 1017 sizeof(struct hugetlbfs_inode_info),
1002 0, 0, init_once); 1018 0, 0, init_once);
@@ -1015,10 +1031,10 @@ static int __init init_hugetlbfs_fs(void)
1015 } 1031 }
1016 1032
1017 error = PTR_ERR(vfsmount); 1033 error = PTR_ERR(vfsmount);
1034 unregister_filesystem(&hugetlbfs_fs_type);
1018 1035
1019 out: 1036 out:
1020 if (error) 1037 kmem_cache_destroy(hugetlbfs_inode_cachep);
1021 kmem_cache_destroy(hugetlbfs_inode_cachep);
1022 out2: 1038 out2:
1023 bdi_destroy(&hugetlbfs_backing_dev_info); 1039 bdi_destroy(&hugetlbfs_backing_dev_info);
1024 return error; 1040 return error;
diff --git a/fs/namei.c b/fs/namei.c
index 13e6a1f191a9..a94a7f9a03ea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1455,9 +1455,15 @@ done:
1455} 1455}
1456EXPORT_SYMBOL(full_name_hash); 1456EXPORT_SYMBOL(full_name_hash);
1457 1457
1458#ifdef CONFIG_64BIT
1458#define ONEBYTES 0x0101010101010101ul 1459#define ONEBYTES 0x0101010101010101ul
1459#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful 1460#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful
1460#define HIGHBITS 0x8080808080808080ul 1461#define HIGHBITS 0x8080808080808080ul
1462#else
1463#define ONEBYTES 0x01010101ul
1464#define SLASHBYTES 0x2f2f2f2ful
1465#define HIGHBITS 0x80808080ul
1466#endif
1461 1467
1462/* Return the high bit set in the first byte that is a zero */ 1468/* Return the high bit set in the first byte that is a zero */
1463static inline unsigned long has_zero(unsigned long a) 1469static inline unsigned long has_zero(unsigned long a)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 965d4bde3a3b..3b42c1418f31 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2989,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2989 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2989 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2990 ONE("stat", S_IRUGO, proc_tgid_stat), 2990 ONE("stat", S_IRUGO, proc_tgid_stat),
2991 ONE("statm", S_IRUGO, proc_pid_statm), 2991 ONE("statm", S_IRUGO, proc_pid_statm),
2992 REG("maps", S_IRUGO, proc_maps_operations), 2992 REG("maps", S_IRUGO, proc_pid_maps_operations),
2993#ifdef CONFIG_NUMA 2993#ifdef CONFIG_NUMA
2994 REG("numa_maps", S_IRUGO, proc_numa_maps_operations), 2994 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
2995#endif 2995#endif
2996 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), 2996 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2997 LNK("cwd", proc_cwd_link), 2997 LNK("cwd", proc_cwd_link),
@@ -3002,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {
3002 REG("mountstats", S_IRUSR, proc_mountstats_operations), 3002 REG("mountstats", S_IRUSR, proc_mountstats_operations),
3003#ifdef CONFIG_PROC_PAGE_MONITOR 3003#ifdef CONFIG_PROC_PAGE_MONITOR
3004 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3004 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3005 REG("smaps", S_IRUGO, proc_smaps_operations), 3005 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
3006 REG("pagemap", S_IRUGO, proc_pagemap_operations), 3006 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3007#endif 3007#endif
3008#ifdef CONFIG_SECURITY 3008#ifdef CONFIG_SECURITY
@@ -3348,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = {
3348 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3348 INF("cmdline", S_IRUGO, proc_pid_cmdline),
3349 ONE("stat", S_IRUGO, proc_tid_stat), 3349 ONE("stat", S_IRUGO, proc_tid_stat),
3350 ONE("statm", S_IRUGO, proc_pid_statm), 3350 ONE("statm", S_IRUGO, proc_pid_statm),
3351 REG("maps", S_IRUGO, proc_maps_operations), 3351 REG("maps", S_IRUGO, proc_tid_maps_operations),
3352#ifdef CONFIG_NUMA 3352#ifdef CONFIG_NUMA
3353 REG("numa_maps", S_IRUGO, proc_numa_maps_operations), 3353 REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
3354#endif 3354#endif
3355 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), 3355 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
3356 LNK("cwd", proc_cwd_link), 3356 LNK("cwd", proc_cwd_link),
@@ -3360,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = {
3360 REG("mountinfo", S_IRUGO, proc_mountinfo_operations), 3360 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
3361#ifdef CONFIG_PROC_PAGE_MONITOR 3361#ifdef CONFIG_PROC_PAGE_MONITOR
3362 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3362 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3363 REG("smaps", S_IRUGO, proc_smaps_operations), 3363 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
3364 REG("pagemap", S_IRUGO, proc_pagemap_operations), 3364 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3365#endif 3365#endif
3366#ifdef CONFIG_SECURITY 3366#ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 292577531ad1..c44efe19798f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -53,9 +53,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
53 struct pid *pid, struct task_struct *task); 53 struct pid *pid, struct task_struct *task);
54extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); 54extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
55 55
56extern const struct file_operations proc_maps_operations; 56extern const struct file_operations proc_pid_maps_operations;
57extern const struct file_operations proc_numa_maps_operations; 57extern const struct file_operations proc_tid_maps_operations;
58extern const struct file_operations proc_smaps_operations; 58extern const struct file_operations proc_pid_numa_maps_operations;
59extern const struct file_operations proc_tid_numa_maps_operations;
60extern const struct file_operations proc_pid_smaps_operations;
61extern const struct file_operations proc_tid_smaps_operations;
59extern const struct file_operations proc_clear_refs_operations; 62extern const struct file_operations proc_clear_refs_operations;
60extern const struct file_operations proc_pagemap_operations; 63extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 64extern const struct file_operations proc_net_operations;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 6d8e6a9e93ab..7fcd0d60a968 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)
115 u |= 1 << KPF_COMPOUND_TAIL; 115 u |= 1 << KPF_COMPOUND_TAIL;
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 else if (PageTransCompound(page))
119 u |= 1 << KPF_THP;
118 120
119 /* 121 /*
120 * Caveats on high order pages: page->_count will only be set 122 * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a250495..9694cc283511 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,
209 return ret; 209 return ret;
210} 210}
211 211
212static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) 212static void
213show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
213{ 214{
214 struct mm_struct *mm = vma->vm_mm; 215 struct mm_struct *mm = vma->vm_mm;
215 struct file *file = vma->vm_file; 216 struct file *file = vma->vm_file;
217 struct proc_maps_private *priv = m->private;
218 struct task_struct *task = priv->task;
216 vm_flags_t flags = vma->vm_flags; 219 vm_flags_t flags = vma->vm_flags;
217 unsigned long ino = 0; 220 unsigned long ino = 0;
218 unsigned long long pgoff = 0; 221 unsigned long long pgoff = 0;
219 unsigned long start, end; 222 unsigned long start, end;
220 dev_t dev = 0; 223 dev_t dev = 0;
221 int len; 224 int len;
225 const char *name = NULL;
222 226
223 if (file) { 227 if (file) {
224 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 228 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
252 if (file) { 256 if (file) {
253 pad_len_spaces(m, len); 257 pad_len_spaces(m, len);
254 seq_path(m, &file->f_path, "\n"); 258 seq_path(m, &file->f_path, "\n");
255 } else { 259 goto done;
256 const char *name = arch_vma_name(vma); 260 }
257 if (!name) { 261
258 if (mm) { 262 name = arch_vma_name(vma);
259 if (vma->vm_start <= mm->brk && 263 if (!name) {
260 vma->vm_end >= mm->start_brk) { 264 pid_t tid;
261 name = "[heap]"; 265
262 } else if (vma->vm_start <= mm->start_stack && 266 if (!mm) {
263 vma->vm_end >= mm->start_stack) { 267 name = "[vdso]";
264 name = "[stack]"; 268 goto done;
265 } 269 }
270
271 if (vma->vm_start <= mm->brk &&
272 vma->vm_end >= mm->start_brk) {
273 name = "[heap]";
274 goto done;
275 }
276
277 tid = vm_is_stack(task, vma, is_pid);
278
279 if (tid != 0) {
280 /*
281 * Thread stack in /proc/PID/task/TID/maps or
282 * the main process stack.
283 */
284 if (!is_pid || (vma->vm_start <= mm->start_stack &&
285 vma->vm_end >= mm->start_stack)) {
286 name = "[stack]";
266 } else { 287 } else {
267 name = "[vdso]"; 288 /* Thread stack in /proc/PID/maps */
289 pad_len_spaces(m, len);
290 seq_printf(m, "[stack:%d]", tid);
268 } 291 }
269 } 292 }
270 if (name) { 293 }
271 pad_len_spaces(m, len); 294
272 seq_puts(m, name); 295done:
273 } 296 if (name) {
297 pad_len_spaces(m, len);
298 seq_puts(m, name);
274 } 299 }
275 seq_putc(m, '\n'); 300 seq_putc(m, '\n');
276} 301}
277 302
278static int show_map(struct seq_file *m, void *v) 303static int show_map(struct seq_file *m, void *v, int is_pid)
279{ 304{
280 struct vm_area_struct *vma = v; 305 struct vm_area_struct *vma = v;
281 struct proc_maps_private *priv = m->private; 306 struct proc_maps_private *priv = m->private;
282 struct task_struct *task = priv->task; 307 struct task_struct *task = priv->task;
283 308
284 show_map_vma(m, vma); 309 show_map_vma(m, vma, is_pid);
285 310
286 if (m->count < m->size) /* vma is copied successfully */ 311 if (m->count < m->size) /* vma is copied successfully */
287 m->version = (vma != get_gate_vma(task->mm)) 312 m->version = (vma != get_gate_vma(task->mm))
@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)
289 return 0; 314 return 0;
290} 315}
291 316
317static int show_pid_map(struct seq_file *m, void *v)
318{
319 return show_map(m, v, 1);
320}
321
322static int show_tid_map(struct seq_file *m, void *v)
323{
324 return show_map(m, v, 0);
325}
326
292static const struct seq_operations proc_pid_maps_op = { 327static const struct seq_operations proc_pid_maps_op = {
293 .start = m_start, 328 .start = m_start,
294 .next = m_next, 329 .next = m_next,
295 .stop = m_stop, 330 .stop = m_stop,
296 .show = show_map 331 .show = show_pid_map
297}; 332};
298 333
299static int maps_open(struct inode *inode, struct file *file) 334static const struct seq_operations proc_tid_maps_op = {
335 .start = m_start,
336 .next = m_next,
337 .stop = m_stop,
338 .show = show_tid_map
339};
340
341static int pid_maps_open(struct inode *inode, struct file *file)
300{ 342{
301 return do_maps_open(inode, file, &proc_pid_maps_op); 343 return do_maps_open(inode, file, &proc_pid_maps_op);
302} 344}
303 345
304const struct file_operations proc_maps_operations = { 346static int tid_maps_open(struct inode *inode, struct file *file)
305 .open = maps_open, 347{
348 return do_maps_open(inode, file, &proc_tid_maps_op);
349}
350
351const struct file_operations proc_pid_maps_operations = {
352 .open = pid_maps_open,
353 .read = seq_read,
354 .llseek = seq_lseek,
355 .release = seq_release_private,
356};
357
358const struct file_operations proc_tid_maps_operations = {
359 .open = tid_maps_open,
306 .read = seq_read, 360 .read = seq_read,
307 .llseek = seq_lseek, 361 .llseek = seq_lseek,
308 .release = seq_release_private, 362 .release = seq_release_private,
@@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
394 pte_t *pte; 448 pte_t *pte;
395 spinlock_t *ptl; 449 spinlock_t *ptl;
396 450
397 spin_lock(&walk->mm->page_table_lock); 451 if (pmd_trans_huge_lock(pmd, vma) == 1) {
398 if (pmd_trans_huge(*pmd)) { 452 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
399 if (pmd_trans_splitting(*pmd)) {
400 spin_unlock(&walk->mm->page_table_lock);
401 wait_split_huge_page(vma->anon_vma, pmd);
402 } else {
403 smaps_pte_entry(*(pte_t *)pmd, addr,
404 HPAGE_PMD_SIZE, walk);
405 spin_unlock(&walk->mm->page_table_lock);
406 mss->anonymous_thp += HPAGE_PMD_SIZE;
407 return 0;
408 }
409 } else {
410 spin_unlock(&walk->mm->page_table_lock); 453 spin_unlock(&walk->mm->page_table_lock);
454 mss->anonymous_thp += HPAGE_PMD_SIZE;
455 return 0;
411 } 456 }
457
458 if (pmd_trans_unstable(pmd))
459 return 0;
412 /* 460 /*
413 * The mmap_sem held all the way back in m_start() is what 461 * The mmap_sem held all the way back in m_start() is what
414 * keeps khugepaged out of here and from collapsing things 462 * keeps khugepaged out of here and from collapsing things
@@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
422 return 0; 470 return 0;
423} 471}
424 472
425static int show_smap(struct seq_file *m, void *v) 473static int show_smap(struct seq_file *m, void *v, int is_pid)
426{ 474{
427 struct proc_maps_private *priv = m->private; 475 struct proc_maps_private *priv = m->private;
428 struct task_struct *task = priv->task; 476 struct task_struct *task = priv->task;
@@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v)
440 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 488 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
441 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 489 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
442 490
443 show_map_vma(m, vma); 491 show_map_vma(m, vma, is_pid);
444 492
445 seq_printf(m, 493 seq_printf(m,
446 "Size: %8lu kB\n" 494 "Size: %8lu kB\n"
@@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v)
479 return 0; 527 return 0;
480} 528}
481 529
530static int show_pid_smap(struct seq_file *m, void *v)
531{
532 return show_smap(m, v, 1);
533}
534
535static int show_tid_smap(struct seq_file *m, void *v)
536{
537 return show_smap(m, v, 0);
538}
539
482static const struct seq_operations proc_pid_smaps_op = { 540static const struct seq_operations proc_pid_smaps_op = {
483 .start = m_start, 541 .start = m_start,
484 .next = m_next, 542 .next = m_next,
485 .stop = m_stop, 543 .stop = m_stop,
486 .show = show_smap 544 .show = show_pid_smap
545};
546
547static const struct seq_operations proc_tid_smaps_op = {
548 .start = m_start,
549 .next = m_next,
550 .stop = m_stop,
551 .show = show_tid_smap
487}; 552};
488 553
489static int smaps_open(struct inode *inode, struct file *file) 554static int pid_smaps_open(struct inode *inode, struct file *file)
490{ 555{
491 return do_maps_open(inode, file, &proc_pid_smaps_op); 556 return do_maps_open(inode, file, &proc_pid_smaps_op);
492} 557}
493 558
494const struct file_operations proc_smaps_operations = { 559static int tid_smaps_open(struct inode *inode, struct file *file)
495 .open = smaps_open, 560{
561 return do_maps_open(inode, file, &proc_tid_smaps_op);
562}
563
564const struct file_operations proc_pid_smaps_operations = {
565 .open = pid_smaps_open,
566 .read = seq_read,
567 .llseek = seq_lseek,
568 .release = seq_release_private,
569};
570
571const struct file_operations proc_tid_smaps_operations = {
572 .open = tid_smaps_open,
496 .read = seq_read, 573 .read = seq_read,
497 .llseek = seq_lseek, 574 .llseek = seq_lseek,
498 .release = seq_release_private, 575 .release = seq_release_private,
@@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
507 struct page *page; 584 struct page *page;
508 585
509 split_huge_page_pmd(walk->mm, pmd); 586 split_huge_page_pmd(walk->mm, pmd);
587 if (pmd_trans_unstable(pmd))
588 return 0;
510 589
511 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 590 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
512 for (; addr != end; pte++, addr += PAGE_SIZE) { 591 for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -598,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = {
598 .llseek = noop_llseek, 677 .llseek = noop_llseek,
599}; 678};
600 679
680typedef struct {
681 u64 pme;
682} pagemap_entry_t;
683
601struct pagemapread { 684struct pagemapread {
602 int pos, len; 685 int pos, len;
603 u64 *buffer; 686 pagemap_entry_t *buffer;
604}; 687};
605 688
689#define PAGEMAP_WALK_SIZE (PMD_SIZE)
690#define PAGEMAP_WALK_MASK (PMD_MASK)
691
606#define PM_ENTRY_BYTES sizeof(u64) 692#define PM_ENTRY_BYTES sizeof(u64)
607#define PM_STATUS_BITS 3 693#define PM_STATUS_BITS 3
608#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 694#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
@@ -620,10 +706,15 @@ struct pagemapread {
620#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 706#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
621#define PM_END_OF_BUFFER 1 707#define PM_END_OF_BUFFER 1
622 708
623static int add_to_pagemap(unsigned long addr, u64 pfn, 709static inline pagemap_entry_t make_pme(u64 val)
710{
711 return (pagemap_entry_t) { .pme = val };
712}
713
714static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
624 struct pagemapread *pm) 715 struct pagemapread *pm)
625{ 716{
626 pm->buffer[pm->pos++] = pfn; 717 pm->buffer[pm->pos++] = *pme;
627 if (pm->pos >= pm->len) 718 if (pm->pos >= pm->len)
628 return PM_END_OF_BUFFER; 719 return PM_END_OF_BUFFER;
629 return 0; 720 return 0;
@@ -635,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
635 struct pagemapread *pm = walk->private; 726 struct pagemapread *pm = walk->private;
636 unsigned long addr; 727 unsigned long addr;
637 int err = 0; 728 int err = 0;
729 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
730
638 for (addr = start; addr < end; addr += PAGE_SIZE) { 731 for (addr = start; addr < end; addr += PAGE_SIZE) {
639 err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); 732 err = add_to_pagemap(addr, &pme, pm);
640 if (err) 733 if (err)
641 break; 734 break;
642 } 735 }
@@ -649,17 +742,35 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
649 return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); 742 return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
650} 743}
651 744
652static u64 pte_to_pagemap_entry(pte_t pte) 745static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)
653{ 746{
654 u64 pme = 0;
655 if (is_swap_pte(pte)) 747 if (is_swap_pte(pte))
656 pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) 748 *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte))
657 | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; 749 | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);
658 else if (pte_present(pte)) 750 else if (pte_present(pte))
659 pme = PM_PFRAME(pte_pfn(pte)) 751 *pme = make_pme(PM_PFRAME(pte_pfn(pte))
660 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; 752 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
661 return pme; 753}
754
755#ifdef CONFIG_TRANSPARENT_HUGEPAGE
756static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
757 pmd_t pmd, int offset)
758{
759 /*
760 * Currently pmd for thp is always present because thp can not be
761 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
762 * This if-check is just to prepare for future implementation.
763 */
764 if (pmd_present(pmd))
765 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
766 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
662} 767}
768#else
769static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
770 pmd_t pmd, int offset)
771{
772}
773#endif
663 774
664static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 775static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
665 struct mm_walk *walk) 776 struct mm_walk *walk)
@@ -668,13 +779,30 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
668 struct pagemapread *pm = walk->private; 779 struct pagemapread *pm = walk->private;
669 pte_t *pte; 780 pte_t *pte;
670 int err = 0; 781 int err = 0;
782 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
671 783
672 split_huge_page_pmd(walk->mm, pmd); 784 if (pmd_trans_unstable(pmd))
785 return 0;
673 786
674 /* find the first VMA at or above 'addr' */ 787 /* find the first VMA at or above 'addr' */
675 vma = find_vma(walk->mm, addr); 788 vma = find_vma(walk->mm, addr);
789 spin_lock(&walk->mm->page_table_lock);
790 if (pmd_trans_huge_lock(pmd, vma) == 1) {
791 for (; addr != end; addr += PAGE_SIZE) {
792 unsigned long offset;
793
794 offset = (addr & ~PAGEMAP_WALK_MASK) >>
795 PAGE_SHIFT;
796 thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
797 err = add_to_pagemap(addr, &pme, pm);
798 if (err)
799 break;
800 }
801 spin_unlock(&walk->mm->page_table_lock);
802 return err;
803 }
804
676 for (; addr != end; addr += PAGE_SIZE) { 805 for (; addr != end; addr += PAGE_SIZE) {
677 u64 pfn = PM_NOT_PRESENT;
678 806
679 /* check to see if we've left 'vma' behind 807 /* check to see if we've left 'vma' behind
680 * and need a new, higher one */ 808 * and need a new, higher one */
@@ -686,11 +814,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
686 if (vma && (vma->vm_start <= addr) && 814 if (vma && (vma->vm_start <= addr) &&
687 !is_vm_hugetlb_page(vma)) { 815 !is_vm_hugetlb_page(vma)) {
688 pte = pte_offset_map(pmd, addr); 816 pte = pte_offset_map(pmd, addr);
689 pfn = pte_to_pagemap_entry(*pte); 817 pte_to_pagemap_entry(&pme, *pte);
690 /* unmap before userspace copy */ 818 /* unmap before userspace copy */
691 pte_unmap(pte); 819 pte_unmap(pte);
692 } 820 }
693 err = add_to_pagemap(addr, pfn, pm); 821 err = add_to_pagemap(addr, &pme, pm);
694 if (err) 822 if (err)
695 return err; 823 return err;
696 } 824 }
@@ -701,13 +829,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
701} 829}
702 830
703#ifdef CONFIG_HUGETLB_PAGE 831#ifdef CONFIG_HUGETLB_PAGE
704static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 832static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
833 pte_t pte, int offset)
705{ 834{
706 u64 pme = 0;
707 if (pte_present(pte)) 835 if (pte_present(pte))
708 pme = PM_PFRAME(pte_pfn(pte) + offset) 836 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
709 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; 837 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
710 return pme;
711} 838}
712 839
713/* This function walks within one hugetlb entry in the single call */ 840/* This function walks within one hugetlb entry in the single call */
@@ -717,12 +844,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
717{ 844{
718 struct pagemapread *pm = walk->private; 845 struct pagemapread *pm = walk->private;
719 int err = 0; 846 int err = 0;
720 u64 pfn; 847 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
721 848
722 for (; addr != end; addr += PAGE_SIZE) { 849 for (; addr != end; addr += PAGE_SIZE) {
723 int offset = (addr & ~hmask) >> PAGE_SHIFT; 850 int offset = (addr & ~hmask) >> PAGE_SHIFT;
724 pfn = huge_pte_to_pagemap_entry(*pte, offset); 851 huge_pte_to_pagemap_entry(&pme, *pte, offset);
725 err = add_to_pagemap(addr, pfn, pm); 852 err = add_to_pagemap(addr, &pme, pm);
726 if (err) 853 if (err)
727 return err; 854 return err;
728 } 855 }
@@ -757,8 +884,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
757 * determine which areas of memory are actually mapped and llseek to 884 * determine which areas of memory are actually mapped and llseek to
758 * skip over unmapped regions. 885 * skip over unmapped regions.
759 */ 886 */
760#define PAGEMAP_WALK_SIZE (PMD_SIZE)
761#define PAGEMAP_WALK_MASK (PMD_MASK)
762static ssize_t pagemap_read(struct file *file, char __user *buf, 887static ssize_t pagemap_read(struct file *file, char __user *buf,
763 size_t count, loff_t *ppos) 888 size_t count, loff_t *ppos)
764{ 889{
@@ -941,26 +1066,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
941 pte_t *pte; 1066 pte_t *pte;
942 1067
943 md = walk->private; 1068 md = walk->private;
944 spin_lock(&walk->mm->page_table_lock); 1069
945 if (pmd_trans_huge(*pmd)) { 1070 if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
946 if (pmd_trans_splitting(*pmd)) { 1071 pte_t huge_pte = *(pte_t *)pmd;
947 spin_unlock(&walk->mm->page_table_lock); 1072 struct page *page;
948 wait_split_huge_page(md->vma->anon_vma, pmd); 1073
949 } else { 1074 page = can_gather_numa_stats(huge_pte, md->vma, addr);
950 pte_t huge_pte = *(pte_t *)pmd; 1075 if (page)
951 struct page *page; 1076 gather_stats(page, md, pte_dirty(huge_pte),
952 1077 HPAGE_PMD_SIZE/PAGE_SIZE);
953 page = can_gather_numa_stats(huge_pte, md->vma, addr);
954 if (page)
955 gather_stats(page, md, pte_dirty(huge_pte),
956 HPAGE_PMD_SIZE/PAGE_SIZE);
957 spin_unlock(&walk->mm->page_table_lock);
958 return 0;
959 }
960 } else {
961 spin_unlock(&walk->mm->page_table_lock); 1078 spin_unlock(&walk->mm->page_table_lock);
1079 return 0;
962 } 1080 }
963 1081
1082 if (pmd_trans_unstable(pmd))
1083 return 0;
964 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1084 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
965 do { 1085 do {
966 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1086 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
@@ -1002,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1002/* 1122/*
1003 * Display pages allocated per node and memory policy via /proc. 1123 * Display pages allocated per node and memory policy via /proc.
1004 */ 1124 */
1005static int show_numa_map(struct seq_file *m, void *v) 1125static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1006{ 1126{
1007 struct numa_maps_private *numa_priv = m->private; 1127 struct numa_maps_private *numa_priv = m->private;
1008 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1128 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1039,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v)
1039 seq_path(m, &file->f_path, "\n\t= "); 1159 seq_path(m, &file->f_path, "\n\t= ");
1040 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1160 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1041 seq_printf(m, " heap"); 1161 seq_printf(m, " heap");
1042 } else if (vma->vm_start <= mm->start_stack && 1162 } else {
1043 vma->vm_end >= mm->start_stack) { 1163 pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
1044 seq_printf(m, " stack"); 1164 if (tid != 0) {
1165 /*
1166 * Thread stack in /proc/PID/task/TID/maps or
1167 * the main process stack.
1168 */
1169 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1170 vma->vm_end >= mm->start_stack))
1171 seq_printf(m, " stack");
1172 else
1173 seq_printf(m, " stack:%d", tid);
1174 }
1045 } 1175 }
1046 1176
1047 if (is_vm_hugetlb_page(vma)) 1177 if (is_vm_hugetlb_page(vma))
@@ -1084,21 +1214,39 @@ out:
1084 return 0; 1214 return 0;
1085} 1215}
1086 1216
1217static int show_pid_numa_map(struct seq_file *m, void *v)
1218{
1219 return show_numa_map(m, v, 1);
1220}
1221
1222static int show_tid_numa_map(struct seq_file *m, void *v)
1223{
1224 return show_numa_map(m, v, 0);
1225}
1226
1087static const struct seq_operations proc_pid_numa_maps_op = { 1227static const struct seq_operations proc_pid_numa_maps_op = {
1088 .start = m_start, 1228 .start = m_start,
1089 .next = m_next, 1229 .next = m_next,
1090 .stop = m_stop, 1230 .stop = m_stop,
1091 .show = show_numa_map, 1231 .show = show_pid_numa_map,
1232};
1233
1234static const struct seq_operations proc_tid_numa_maps_op = {
1235 .start = m_start,
1236 .next = m_next,
1237 .stop = m_stop,
1238 .show = show_tid_numa_map,
1092}; 1239};
1093 1240
1094static int numa_maps_open(struct inode *inode, struct file *file) 1241static int numa_maps_open(struct inode *inode, struct file *file,
1242 const struct seq_operations *ops)
1095{ 1243{
1096 struct numa_maps_private *priv; 1244 struct numa_maps_private *priv;
1097 int ret = -ENOMEM; 1245 int ret = -ENOMEM;
1098 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1246 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1099 if (priv) { 1247 if (priv) {
1100 priv->proc_maps.pid = proc_pid(inode); 1248 priv->proc_maps.pid = proc_pid(inode);
1101 ret = seq_open(file, &proc_pid_numa_maps_op); 1249 ret = seq_open(file, ops);
1102 if (!ret) { 1250 if (!ret) {
1103 struct seq_file *m = file->private_data; 1251 struct seq_file *m = file->private_data;
1104 m->private = priv; 1252 m->private = priv;
@@ -1109,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)
1109 return ret; 1257 return ret;
1110} 1258}
1111 1259
1112const struct file_operations proc_numa_maps_operations = { 1260static int pid_numa_maps_open(struct inode *inode, struct file *file)
1113 .open = numa_maps_open, 1261{
1262 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1263}
1264
1265static int tid_numa_maps_open(struct inode *inode, struct file *file)
1266{
1267 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1268}
1269
1270const struct file_operations proc_pid_numa_maps_operations = {
1271 .open = pid_numa_maps_open,
1272 .read = seq_read,
1273 .llseek = seq_lseek,
1274 .release = seq_release_private,
1275};
1276
1277const struct file_operations proc_tid_numa_maps_operations = {
1278 .open = tid_numa_maps_open,
1114 .read = seq_read, 1279 .read = seq_read,
1115 .llseek = seq_lseek, 1280 .llseek = seq_lseek,
1116 .release = seq_release_private, 1281 .release = seq_release_private,
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 980de547c070..74fe164d1b23 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)
134/* 134/*
135 * display a single VMA to a sequenced file 135 * display a single VMA to a sequenced file
136 */ 136 */
137static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) 137static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
138 int is_pid)
138{ 139{
139 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
141 struct proc_maps_private *priv = m->private;
140 unsigned long ino = 0; 142 unsigned long ino = 0;
141 struct file *file; 143 struct file *file;
142 dev_t dev = 0; 144 dev_t dev = 0;
@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
168 pad_len_spaces(m, len); 170 pad_len_spaces(m, len);
169 seq_path(m, &file->f_path, ""); 171 seq_path(m, &file->f_path, "");
170 } else if (mm) { 172 } else if (mm) {
171 if (vma->vm_start <= mm->start_stack && 173 pid_t tid = vm_is_stack(priv->task, vma, is_pid);
172 vma->vm_end >= mm->start_stack) { 174
175 if (tid != 0) {
173 pad_len_spaces(m, len); 176 pad_len_spaces(m, len);
174 seq_puts(m, "[stack]"); 177 /*
178 * Thread stack in /proc/PID/task/TID/maps or
179 * the main process stack.
180 */
181 if (!is_pid || (vma->vm_start <= mm->start_stack &&
182 vma->vm_end >= mm->start_stack))
183 seq_printf(m, "[stack]");
184 else
185 seq_printf(m, "[stack:%d]", tid);
175 } 186 }
176 } 187 }
177 188
@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
182/* 193/*
183 * display mapping lines for a particular process's /proc/pid/maps 194 * display mapping lines for a particular process's /proc/pid/maps
184 */ 195 */
185static int show_map(struct seq_file *m, void *_p) 196static int show_map(struct seq_file *m, void *_p, int is_pid)
186{ 197{
187 struct rb_node *p = _p; 198 struct rb_node *p = _p;
188 199
189 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); 200 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
201 is_pid);
202}
203
204static int show_pid_map(struct seq_file *m, void *_p)
205{
206 return show_map(m, _p, 1);
207}
208
209static int show_tid_map(struct seq_file *m, void *_p)
210{
211 return show_map(m, _p, 0);
190} 212}
191 213
192static void *m_start(struct seq_file *m, loff_t *pos) 214static void *m_start(struct seq_file *m, loff_t *pos)
@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {
240 .start = m_start, 262 .start = m_start,
241 .next = m_next, 263 .next = m_next,
242 .stop = m_stop, 264 .stop = m_stop,
243 .show = show_map 265 .show = show_pid_map
266};
267
268static const struct seq_operations proc_tid_maps_ops = {
269 .start = m_start,
270 .next = m_next,
271 .stop = m_stop,
272 .show = show_tid_map
244}; 273};
245 274
246static int maps_open(struct inode *inode, struct file *file) 275static int maps_open(struct inode *inode, struct file *file,
276 const struct seq_operations *ops)
247{ 277{
248 struct proc_maps_private *priv; 278 struct proc_maps_private *priv;
249 int ret = -ENOMEM; 279 int ret = -ENOMEM;
@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)
251 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 281 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
252 if (priv) { 282 if (priv) {
253 priv->pid = proc_pid(inode); 283 priv->pid = proc_pid(inode);
254 ret = seq_open(file, &proc_pid_maps_ops); 284 ret = seq_open(file, ops);
255 if (!ret) { 285 if (!ret) {
256 struct seq_file *m = file->private_data; 286 struct seq_file *m = file->private_data;
257 m->private = priv; 287 m->private = priv;
@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)
262 return ret; 292 return ret;
263} 293}
264 294
265const struct file_operations proc_maps_operations = { 295static int pid_maps_open(struct inode *inode, struct file *file)
266 .open = maps_open, 296{
297 return maps_open(inode, file, &proc_pid_maps_ops);
298}
299
300static int tid_maps_open(struct inode *inode, struct file *file)
301{
302 return maps_open(inode, file, &proc_tid_maps_ops);
303}
304
305const struct file_operations proc_pid_maps_operations = {
306 .open = pid_maps_open,
307 .read = seq_read,
308 .llseek = seq_lseek,
309 .release = seq_release_private,
310};
311
312const struct file_operations proc_tid_maps_operations = {
313 .open = tid_maps_open,
267 .read = seq_read, 314 .read = seq_read,
268 .llseek = seq_lseek, 315 .llseek = seq_lseek,
269 .release = seq_release_private, 316 .release = seq_release_private,
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4023d6be939b..aa242dc99373 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -140,9 +140,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
140 140
141 mutex_lock(&m->lock); 141 mutex_lock(&m->lock);
142 142
143 /*
144 * seq_file->op->..m_start/m_stop/m_next may do special actions
145 * or optimisations based on the file->f_version, so we want to
146 * pass the file->f_version to those methods.
147 *
148 * seq_file->version is just a copy of f_version, and seq_file
149 * methods can treat it simply as file version.
150 * It is copied in first and copied out after all operations.
151 * It is convenient to have it as part of structure to avoid the
152 * need of passing another argument to all the seq_file methods.
153 */
154 m->version = file->f_version;
155
143 /* Don't assume *ppos is where we left it */ 156 /* Don't assume *ppos is where we left it */
144 if (unlikely(*ppos != m->read_pos)) { 157 if (unlikely(*ppos != m->read_pos)) {
145 m->read_pos = *ppos;
146 while ((err = traverse(m, *ppos)) == -EAGAIN) 158 while ((err = traverse(m, *ppos)) == -EAGAIN)
147 ; 159 ;
148 if (err) { 160 if (err) {
@@ -152,21 +164,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
152 m->index = 0; 164 m->index = 0;
153 m->count = 0; 165 m->count = 0;
154 goto Done; 166 goto Done;
167 } else {
168 m->read_pos = *ppos;
155 } 169 }
156 } 170 }
157 171
158 /*
159 * seq_file->op->..m_start/m_stop/m_next may do special actions
160 * or optimisations based on the file->f_version, so we want to
161 * pass the file->f_version to those methods.
162 *
163 * seq_file->version is just copy of f_version, and seq_file
164 * methods can treat it simply as file version.
165 * It is copied in first and copied out after all operations.
166 * It is convenient to have it as part of structure to avoid the
167 * need of passing another argument to all the seq_file methods.
168 */
169 m->version = file->f_version;
170 /* grab buffer if we didn't have one */ 172 /* grab buffer if we didn't have one */
171 if (!m->buf) { 173 if (!m->buf) {
172 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 174 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 76bff2bff15e..a03c098b0cce 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
425 unsigned long size); 425 unsigned long size);
426#endif 426#endif
427 427
428#ifdef CONFIG_MMU
429
428#ifndef CONFIG_TRANSPARENT_HUGEPAGE 430#ifndef CONFIG_TRANSPARENT_HUGEPAGE
429static inline int pmd_trans_huge(pmd_t pmd) 431static inline int pmd_trans_huge(pmd_t pmd)
430{ 432{
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
441 return 0; 443 return 0;
442} 444}
443#endif /* __HAVE_ARCH_PMD_WRITE */ 445#endif /* __HAVE_ARCH_PMD_WRITE */
446#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
447
448/*
449 * This function is meant to be used by sites walking pagetables with
450 * the mmap_sem held in read mode to protect against MADV_DONTNEED and
451 * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
452 * into a null pmd and the transhuge page fault can convert a null pmd
453 * into a hugepmd or into a regular pmd (if the hugepage allocation
454 * fails). While holding the mmap_sem in read mode the pmd becomes
455 * stable and stops changing under us only if it's not null and not a
456 * transhuge pmd. When those races occur and this function makes a
457 * difference vs the standard pmd_none_or_clear_bad, the result is
458 * undefined so behaving as if the pmd was none is safe (because it
459 * can return none anyway). The compiler level barrier() is critically
460 * important to compute the two checks atomically on the same pmdval.
461 */
462static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
463{
464 /* depend on compiler for an atomic pmd read */
465 pmd_t pmdval = *pmd;
466 /*
467 * The barrier will stabilize the pmdval in a register or on
468 * the stack so that it will stop changing under the code.
469 */
470#ifdef CONFIG_TRANSPARENT_HUGEPAGE
471 barrier();
472#endif
473 if (pmd_none(pmdval))
474 return 1;
475 if (unlikely(pmd_bad(pmdval))) {
476 if (!pmd_trans_huge(pmdval))
477 pmd_clear_bad(pmd);
478 return 1;
479 }
480 return 0;
481}
482
483/*
484 * This is a noop if Transparent Hugepage Support is not built into
485 * the kernel. Otherwise it is equivalent to
486 * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
487 * places that already verified the pmd is not none and they want to
488 * walk ptes while holding the mmap sem in read mode (write mode doesn't
489 * need this). If THP is not enabled, the pmd can't go away under the
490 * code even if MADV_DONTNEED runs, but if THP is enabled we need to
491 * run a pmd_trans_unstable before walking the ptes after
492 * split_huge_page_pmd returns (because it may have run when the pmd
493 * became null, but then a page fault can map in a THP and not a
494 * regular page).
495 */
496static inline int pmd_trans_unstable(pmd_t *pmd)
497{
498#ifdef CONFIG_TRANSPARENT_HUGEPAGE
499 return pmd_none_or_trans_huge_or_clear_bad(pmd);
500#else
501 return 0;
444#endif 502#endif
503}
504
505#endif /* CONFIG_MMU */
445 506
446#endif /* !__ASSEMBLY__ */ 507#endif /* !__ASSEMBLY__ */
447 508
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 501adb1b2f43..5a85b3415c1b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -498,7 +498,7 @@ struct cgroup_subsys {
498 struct list_head sibling; 498 struct list_head sibling;
499 /* used when use_id == true */ 499 /* used when use_id == true */
500 struct idr idr; 500 struct idr idr;
501 rwlock_t id_lock; 501 spinlock_t id_lock;
502 502
503 /* should be defined only by modular subsystems */ 503 /* should be defined only by modular subsystems */
504 struct module *module; 504 struct module *module;
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index bb2bbdbe5464..51a90b7f2d60 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -23,6 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 23extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask, 24 int order, gfp_t gfp_mask, nodemask_t *mask,
25 bool sync); 25 bool sync);
26extern int compact_pgdat(pg_data_t *pgdat, int order);
26extern unsigned long compaction_suitable(struct zone *zone, int order); 27extern unsigned long compaction_suitable(struct zone *zone, int order);
27 28
28/* Do not skip compaction more than 64 times */ 29/* Do not skip compaction more than 64 times */
@@ -33,20 +34,26 @@ extern unsigned long compaction_suitable(struct zone *zone, int order);
33 * allocation success. 1 << compact_defer_limit compactions are skipped up 34 * allocation success. 1 << compact_defer_limit compactions are skipped up
34 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT 35 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
35 */ 36 */
36static inline void defer_compaction(struct zone *zone) 37static inline void defer_compaction(struct zone *zone, int order)
37{ 38{
38 zone->compact_considered = 0; 39 zone->compact_considered = 0;
39 zone->compact_defer_shift++; 40 zone->compact_defer_shift++;
40 41
42 if (order < zone->compact_order_failed)
43 zone->compact_order_failed = order;
44
41 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) 45 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
42 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; 46 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
43} 47}
44 48
45/* Returns true if compaction should be skipped this time */ 49/* Returns true if compaction should be skipped this time */
46static inline bool compaction_deferred(struct zone *zone) 50static inline bool compaction_deferred(struct zone *zone, int order)
47{ 51{
48 unsigned long defer_limit = 1UL << zone->compact_defer_shift; 52 unsigned long defer_limit = 1UL << zone->compact_defer_shift;
49 53
54 if (order < zone->compact_order_failed)
55 return false;
56
50 /* Avoid possible overflow */ 57 /* Avoid possible overflow */
51 if (++zone->compact_considered > defer_limit) 58 if (++zone->compact_considered > defer_limit)
52 zone->compact_considered = defer_limit; 59 zone->compact_considered = defer_limit;
@@ -62,16 +69,21 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
62 return COMPACT_CONTINUE; 69 return COMPACT_CONTINUE;
63} 70}
64 71
72static inline int compact_pgdat(pg_data_t *pgdat, int order)
73{
74 return COMPACT_CONTINUE;
75}
76
65static inline unsigned long compaction_suitable(struct zone *zone, int order) 77static inline unsigned long compaction_suitable(struct zone *zone, int order)
66{ 78{
67 return COMPACT_SKIPPED; 79 return COMPACT_SKIPPED;
68} 80}
69 81
70static inline void defer_compaction(struct zone *zone) 82static inline void defer_compaction(struct zone *zone, int order)
71{ 83{
72} 84}
73 85
74static inline bool compaction_deferred(struct zone *zone) 86static inline bool compaction_deferred(struct zone *zone, int order)
75{ 87{
76 return 1; 88 return 1;
77} 89}
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e9eaec522655..7a7e5fd2a277 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
89extern void cpuset_print_task_mems_allowed(struct task_struct *p); 89extern void cpuset_print_task_mems_allowed(struct task_struct *p);
90 90
91/* 91/*
92 * reading current mems_allowed and mempolicy in the fastpath must protected 92 * get_mems_allowed is required when making decisions involving mems_allowed
93 * by get_mems_allowed() 93 * such as during page allocation. mems_allowed can be updated in parallel
94 * and depending on the new value an operation can fail potentially causing
95 * process failure. A retry loop with get_mems_allowed and put_mems_allowed
96 * prevents these artificial failures.
94 */ 97 */
95static inline void get_mems_allowed(void) 98static inline unsigned int get_mems_allowed(void)
96{ 99{
97 current->mems_allowed_change_disable++; 100 return read_seqcount_begin(&current->mems_allowed_seq);
98
99 /*
100 * ensure that reading mems_allowed and mempolicy happens after the
101 * update of ->mems_allowed_change_disable.
102 *
103 * the write-side task finds ->mems_allowed_change_disable is not 0,
104 * and knows the read-side task is reading mems_allowed or mempolicy,
105 * so it will clear old bits lazily.
106 */
107 smp_mb();
108} 101}
109 102
110static inline void put_mems_allowed(void) 103/*
104 * If this returns false, the operation that took place after get_mems_allowed
105 * may have failed. It is up to the caller to retry the operation if
106 * appropriate.
107 */
108static inline bool put_mems_allowed(unsigned int seq)
111{ 109{
112 /* 110 return !read_seqcount_retry(&current->mems_allowed_seq, seq);
113 * ensure that reading mems_allowed and mempolicy before reducing
114 * mems_allowed_change_disable.
115 *
116 * the write-side task will know that the read-side task is still
117 * reading mems_allowed or mempolicy, don't clears old bits in the
118 * nodemask.
119 */
120 smp_mb();
121 --ACCESS_ONCE(current->mems_allowed_change_disable);
122} 111}
123 112
124static inline void set_mems_allowed(nodemask_t nodemask) 113static inline void set_mems_allowed(nodemask_t nodemask)
125{ 114{
126 task_lock(current); 115 task_lock(current);
116 write_seqcount_begin(&current->mems_allowed_seq);
127 current->mems_allowed = nodemask; 117 current->mems_allowed = nodemask;
118 write_seqcount_end(&current->mems_allowed_seq);
128 task_unlock(current); 119 task_unlock(current);
129} 120}
130 121
@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
234{ 225{
235} 226}
236 227
237static inline void get_mems_allowed(void) 228static inline unsigned int get_mems_allowed(void)
238{ 229{
230 return 0;
239} 231}
240 232
241static inline void put_mems_allowed(void) 233static inline bool put_mems_allowed(unsigned int seq)
242{ 234{
235 return true;
243} 236}
244 237
245#endif /* !CONFIG_CPUSETS */ 238#endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1b921299abc4..c8af7a2efb52 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -51,6 +51,9 @@ extern pmd_t *page_check_address_pmd(struct page *page,
51 unsigned long address, 51 unsigned long address,
52 enum page_check_address_pmd_flag flag); 52 enum page_check_address_pmd_flag flag);
53 53
54#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
55#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
56
54#ifdef CONFIG_TRANSPARENT_HUGEPAGE 57#ifdef CONFIG_TRANSPARENT_HUGEPAGE
55#define HPAGE_PMD_SHIFT HPAGE_SHIFT 58#define HPAGE_PMD_SHIFT HPAGE_SHIFT
56#define HPAGE_PMD_MASK HPAGE_MASK 59#define HPAGE_PMD_MASK HPAGE_MASK
@@ -102,8 +105,6 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
102 BUG_ON(pmd_trans_splitting(*____pmd) || \ 105 BUG_ON(pmd_trans_splitting(*____pmd) || \
103 pmd_trans_huge(*____pmd)); \ 106 pmd_trans_huge(*____pmd)); \
104 } while (0) 107 } while (0)
105#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
106#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
107#if HPAGE_PMD_ORDER > MAX_ORDER 108#if HPAGE_PMD_ORDER > MAX_ORDER
108#error "hugepages can't be allocated by the buddy allocator" 109#error "hugepages can't be allocated by the buddy allocator"
109#endif 110#endif
@@ -113,6 +114,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
113 unsigned long start, 114 unsigned long start,
114 unsigned long end, 115 unsigned long end,
115 long adjust_next); 116 long adjust_next);
117extern int __pmd_trans_huge_lock(pmd_t *pmd,
118 struct vm_area_struct *vma);
119/* mmap_sem must be held on entry */
120static inline int pmd_trans_huge_lock(pmd_t *pmd,
121 struct vm_area_struct *vma)
122{
123 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
124 if (pmd_trans_huge(*pmd))
125 return __pmd_trans_huge_lock(pmd, vma);
126 else
127 return 0;
128}
116static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, 129static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
117 unsigned long start, 130 unsigned long start,
118 unsigned long end, 131 unsigned long end,
@@ -146,9 +159,9 @@ static inline struct page *compound_trans_head(struct page *page)
146 return page; 159 return page;
147} 160}
148#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 161#else /* CONFIG_TRANSPARENT_HUGEPAGE */
149#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) 162#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
150#define HPAGE_PMD_MASK ({ BUG(); 0; }) 163#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
151#define HPAGE_PMD_SIZE ({ BUG(); 0; }) 164#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
152 165
153#define hpage_nr_pages(x) 1 166#define hpage_nr_pages(x) 1
154 167
@@ -176,6 +189,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
176 long adjust_next) 189 long adjust_next)
177{ 190{
178} 191}
192static inline int pmd_trans_huge_lock(pmd_t *pmd,
193 struct vm_area_struct *vma)
194{
195 return 0;
196}
179#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 197#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
180 198
181#endif /* _LINUX_HUGE_MM_H */ 199#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d9d6c868b86b..000837e126e6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
14#include <linux/shm.h> 14#include <linux/shm.h>
15#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
16 16
17struct hugepage_subpool {
18 spinlock_t lock;
19 long count;
20 long max_hpages, used_hpages;
21};
22
23struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
24void hugepage_put_subpool(struct hugepage_subpool *spool);
25
17int PageHuge(struct page *page); 26int PageHuge(struct page *page);
18 27
19void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 28void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -128,35 +137,14 @@ enum {
128}; 137};
129 138
130#ifdef CONFIG_HUGETLBFS 139#ifdef CONFIG_HUGETLBFS
131struct hugetlbfs_config {
132 uid_t uid;
133 gid_t gid;
134 umode_t mode;
135 long nr_blocks;
136 long nr_inodes;
137 struct hstate *hstate;
138};
139
140struct hugetlbfs_sb_info { 140struct hugetlbfs_sb_info {
141 long max_blocks; /* blocks allowed */
142 long free_blocks; /* blocks free */
143 long max_inodes; /* inodes allowed */ 141 long max_inodes; /* inodes allowed */
144 long free_inodes; /* inodes free */ 142 long free_inodes; /* inodes free */
145 spinlock_t stat_lock; 143 spinlock_t stat_lock;
146 struct hstate *hstate; 144 struct hstate *hstate;
145 struct hugepage_subpool *spool;
147}; 146};
148 147
149
150struct hugetlbfs_inode_info {
151 struct shared_policy policy;
152 struct inode vfs_inode;
153};
154
155static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
156{
157 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
158}
159
160static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) 148static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
161{ 149{
162 return sb->s_fs_info; 150 return sb->s_fs_info;
@@ -164,10 +152,9 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
164 152
165extern const struct file_operations hugetlbfs_file_operations; 153extern const struct file_operations hugetlbfs_file_operations;
166extern const struct vm_operations_struct hugetlb_vm_ops; 154extern const struct vm_operations_struct hugetlb_vm_ops;
167struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, 155struct file *hugetlb_file_setup(const char *name, unsigned long addr,
156 size_t size, vm_flags_t acct,
168 struct user_struct **user, int creat_flags); 157 struct user_struct **user, int creat_flags);
169int hugetlb_get_quota(struct address_space *mapping, long delta);
170void hugetlb_put_quota(struct address_space *mapping, long delta);
171 158
172static inline int is_file_hugepages(struct file *file) 159static inline int is_file_hugepages(struct file *file)
173{ 160{
@@ -179,15 +166,11 @@ static inline int is_file_hugepages(struct file *file)
179 return 0; 166 return 0;
180} 167}
181 168
182static inline void set_file_hugepages(struct file *file)
183{
184 file->f_op = &hugetlbfs_file_operations;
185}
186#else /* !CONFIG_HUGETLBFS */ 169#else /* !CONFIG_HUGETLBFS */
187 170
188#define is_file_hugepages(file) 0 171#define is_file_hugepages(file) 0
189#define set_file_hugepages(file) BUG() 172static inline struct file *
190static inline struct file *hugetlb_file_setup(const char *name, size_t size, 173hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
191 vm_flags_t acctflag, struct user_struct **user, int creat_flags) 174 vm_flags_t acctflag, struct user_struct **user, int creat_flags)
192{ 175{
193 return ERR_PTR(-ENOSYS); 176 return ERR_PTR(-ENOSYS);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f994d51f70f2..e4baff5f7ff4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -29,6 +29,13 @@ extern struct fs_struct init_fs;
29#define INIT_GROUP_RWSEM(sig) 29#define INIT_GROUP_RWSEM(sig)
30#endif 30#endif
31 31
32#ifdef CONFIG_CPUSETS
33#define INIT_CPUSET_SEQ \
34 .mems_allowed_seq = SEQCNT_ZERO,
35#else
36#define INIT_CPUSET_SEQ
37#endif
38
32#define INIT_SIGNALS(sig) { \ 39#define INIT_SIGNALS(sig) { \
33 .nr_threads = 1, \ 40 .nr_threads = 1, \
34 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ 41 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -192,6 +199,7 @@ extern struct cred init_cred;
192 INIT_FTRACE_GRAPH \ 199 INIT_FTRACE_GRAPH \
193 INIT_TRACE_RECURSION \ 200 INIT_TRACE_RECURSION \
194 INIT_TASK_RCU_PREEMPT(tsk) \ 201 INIT_TASK_RCU_PREEMPT(tsk) \
202 INIT_CPUSET_SEQ \
195} 203}
196 204
197 205
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index bd92a89f4b0a..26a65711676f 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -30,6 +30,7 @@
30#define KPF_NOPAGE 20 30#define KPF_NOPAGE 20
31 31
32#define KPF_KSM 21 32#define KPF_KSM 21
33#define KPF_THP 22
33 34
34/* kernel hacking assistances 35/* kernel hacking assistances
35 * WARNING: subject to change, never rely on them! 36 * WARNING: subject to change, never rely on them!
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b80de520670b..f94efd2f6c27 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void);
77extern void mem_cgroup_uncharge_page(struct page *page); 77extern void mem_cgroup_uncharge_page(struct page *page);
78extern void mem_cgroup_uncharge_cache_page(struct page *page); 78extern void mem_cgroup_uncharge_cache_page(struct page *page);
79 79
80extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); 80extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
81 int order);
81int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); 82int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
82 83
83extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 84extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
@@ -140,6 +141,34 @@ static inline bool mem_cgroup_disabled(void)
140 return false; 141 return false;
141} 142}
142 143
144void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
145 unsigned long *flags);
146
147extern atomic_t memcg_moving;
148
149static inline void mem_cgroup_begin_update_page_stat(struct page *page,
150 bool *locked, unsigned long *flags)
151{
152 if (mem_cgroup_disabled())
153 return;
154 rcu_read_lock();
155 *locked = false;
156 if (atomic_read(&memcg_moving))
157 __mem_cgroup_begin_update_page_stat(page, locked, flags);
158}
159
160void __mem_cgroup_end_update_page_stat(struct page *page,
161 unsigned long *flags);
162static inline void mem_cgroup_end_update_page_stat(struct page *page,
163 bool *locked, unsigned long *flags)
164{
165 if (mem_cgroup_disabled())
166 return;
167 if (*locked)
168 __mem_cgroup_end_update_page_stat(page, flags);
169 rcu_read_unlock();
170}
171
143void mem_cgroup_update_page_stat(struct page *page, 172void mem_cgroup_update_page_stat(struct page *page,
144 enum mem_cgroup_page_stat_item idx, 173 enum mem_cgroup_page_stat_item idx,
145 int val); 174 int val);
@@ -298,21 +327,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
298{ 327{
299} 328}
300 329
301static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
302{
303 return 0;
304}
305
306static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg,
307 int priority)
308{
309}
310
311static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg,
312 int priority)
313{
314}
315
316static inline bool mem_cgroup_disabled(void) 330static inline bool mem_cgroup_disabled(void)
317{ 331{
318 return true; 332 return true;
@@ -355,6 +369,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
355{ 369{
356} 370}
357 371
372static inline void mem_cgroup_begin_update_page_stat(struct page *page,
373 bool *locked, unsigned long *flags)
374{
375}
376
377static inline void mem_cgroup_end_update_page_stat(struct page *page,
378 bool *locked, unsigned long *flags)
379{
380}
381
358static inline void mem_cgroup_inc_page_stat(struct page *page, 382static inline void mem_cgroup_inc_page_stat(struct page *page,
359 enum mem_cgroup_page_stat_item idx) 383 enum mem_cgroup_page_stat_item idx)
360{ 384{
@@ -391,7 +415,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
391 struct page *newpage) 415 struct page *newpage)
392{ 416{
393} 417}
394#endif /* CONFIG_CGROUP_MEM_CONT */ 418#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
395 419
396#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) 420#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
397static inline bool 421static inline bool
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 05ed2828a553..855c337b20c3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -8,7 +8,6 @@
8typedef struct page *new_page_t(struct page *, unsigned long private, int **); 8typedef struct page *new_page_t(struct page *, unsigned long private, int **);
9 9
10#ifdef CONFIG_MIGRATION 10#ifdef CONFIG_MIGRATION
11#define PAGE_MIGRATION 1
12 11
13extern void putback_lru_pages(struct list_head *l); 12extern void putback_lru_pages(struct list_head *l);
14extern int migrate_page(struct address_space *, 13extern int migrate_page(struct address_space *,
@@ -32,7 +31,6 @@ extern void migrate_page_copy(struct page *newpage, struct page *page);
32extern int migrate_huge_page_move_mapping(struct address_space *mapping, 31extern int migrate_huge_page_move_mapping(struct address_space *mapping,
33 struct page *newpage, struct page *page); 32 struct page *newpage, struct page *page);
34#else 33#else
35#define PAGE_MIGRATION 0
36 34
37static inline void putback_lru_pages(struct list_head *l) {} 35static inline void putback_lru_pages(struct list_head *l) {}
38static inline int migrate_pages(struct list_head *l, new_page_t x, 36static inline int migrate_pages(struct list_head *l, new_page_t x,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b5bb54d6d667..ee67e326b6f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1040,6 +1040,9 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
1040 !vma_growsup(vma->vm_next, addr); 1040 !vma_growsup(vma->vm_next, addr);
1041} 1041}
1042 1042
1043extern pid_t
1044vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
1045
1043extern unsigned long move_page_tables(struct vm_area_struct *vma, 1046extern unsigned long move_page_tables(struct vm_area_struct *vma,
1044 unsigned long old_addr, struct vm_area_struct *new_vma, 1047 unsigned long old_addr, struct vm_area_struct *new_vma,
1045 unsigned long new_addr, unsigned long len); 1048 unsigned long new_addr, unsigned long len);
@@ -1058,19 +1061,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1058/* 1061/*
1059 * per-process(per-mm_struct) statistics. 1062 * per-process(per-mm_struct) statistics.
1060 */ 1063 */
1061static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
1062{
1063 atomic_long_set(&mm->rss_stat.count[member], value);
1064}
1065
1066#if defined(SPLIT_RSS_COUNTING)
1067unsigned long get_mm_counter(struct mm_struct *mm, int member);
1068#else
1069static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) 1064static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
1070{ 1065{
1071 return atomic_long_read(&mm->rss_stat.count[member]); 1066 long val = atomic_long_read(&mm->rss_stat.count[member]);
1072} 1067
1068#ifdef SPLIT_RSS_COUNTING
1069 /*
1070 * The counter is updated asynchronously and may go negative,
1071 * but a negative value is never what users expect to see.
1072 */
1073 if (val < 0)
1074 val = 0;
1073#endif 1075#endif
1076 return (unsigned long)val;
1077}
1074 1078
1075static inline void add_mm_counter(struct mm_struct *mm, int member, long value) 1079static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
1076{ 1080{
@@ -1127,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
1127} 1131}
1128 1132
1129#if defined(SPLIT_RSS_COUNTING) 1133#if defined(SPLIT_RSS_COUNTING)
1130void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); 1134void sync_mm_rss(struct mm_struct *mm);
1131#else 1135#else
1132static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) 1136static inline void sync_mm_rss(struct mm_struct *mm)
1133{ 1137{
1134} 1138}
1135#endif 1139#endif
@@ -1291,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid,
1291extern unsigned long find_min_pfn_with_active_regions(void); 1295extern unsigned long find_min_pfn_with_active_regions(void);
1292extern void free_bootmem_with_active_regions(int nid, 1296extern void free_bootmem_with_active_regions(int nid,
1293 unsigned long max_low_pfn); 1297 unsigned long max_low_pfn);
1294int add_from_early_node_map(struct range *range, int az,
1295 int nr_range, int nid);
1296extern void sparse_memory_present_with_active_regions(int nid); 1298extern void sparse_memory_present_with_active_regions(int nid);
1297 1299
1298#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1300#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 650ba2fb3301..dff711509661 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -365,6 +365,7 @@ struct zone {
365 */ 365 */
366 unsigned int compact_considered; 366 unsigned int compact_considered;
367 unsigned int compact_defer_shift; 367 unsigned int compact_defer_shift;
368 int compact_order_failed;
368#endif 369#endif
369 370
370 ZONE_PADDING(_pad1_) 371 ZONE_PADDING(_pad1_)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 552fba9c7d5a..3d7647536b03 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
49extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 49extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
50 50
51extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 51extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
52 int order, nodemask_t *mask); 52 int order, nodemask_t *mask, bool force_kill);
53extern int register_oom_notifier(struct notifier_block *nb); 53extern int register_oom_notifier(struct notifier_block *nb);
54extern int unregister_oom_notifier(struct notifier_block *nb); 54extern int unregister_oom_notifier(struct notifier_block *nb);
55 55
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e90a673be67e..6b25758e028e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -414,11 +414,26 @@ static inline int PageTransHuge(struct page *page)
414 return PageHead(page); 414 return PageHead(page);
415} 415}
416 416
417/*
418 * PageTransCompound returns true for both transparent huge pages
419 * and hugetlbfs pages, so it should only be called when it's known
420 * that hugetlbfs pages aren't involved.
421 */
417static inline int PageTransCompound(struct page *page) 422static inline int PageTransCompound(struct page *page)
418{ 423{
419 return PageCompound(page); 424 return PageCompound(page);
420} 425}
421 426
427/*
428 * PageTransTail returns true for both transparent huge pages
429 * and hugetlbfs pages, so it should only be called when it's known
430 * that hugetlbfs pages aren't involved.
431 */
432static inline int PageTransTail(struct page *page)
433{
434 return PageTail(page);
435}
436
422#else 437#else
423 438
424static inline int PageTransHuge(struct page *page) 439static inline int PageTransHuge(struct page *page)
@@ -430,6 +445,11 @@ static inline int PageTransCompound(struct page *page)
430{ 445{
431 return 0; 446 return 0;
432} 447}
448
449static inline int PageTransTail(struct page *page)
450{
451 return 0;
452}
433#endif 453#endif
434 454
435#ifdef CONFIG_MMU 455#ifdef CONFIG_MMU
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index a2d11771c84b..a88cdba27809 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -4,12 +4,8 @@
4enum { 4enum {
5 /* flags for mem_cgroup */ 5 /* flags for mem_cgroup */
6 PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ 6 PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
7 PCG_CACHE, /* charged as cache */
8 PCG_USED, /* this object is in use. */ 7 PCG_USED, /* this object is in use. */
9 PCG_MIGRATION, /* under page migration */ 8 PCG_MIGRATION, /* under page migration */
10 /* flags for mem_cgroup and file and I/O status */
11 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
12 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
13 __NR_PCG_FLAGS, 9 __NR_PCG_FLAGS,
14}; 10};
15 11
@@ -64,19 +60,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
64static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ 60static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
65 { return test_and_clear_bit(PCG_##lname, &pc->flags); } 61 { return test_and_clear_bit(PCG_##lname, &pc->flags); }
66 62
67/* Cache flag is set only once (at allocation) */
68TESTPCGFLAG(Cache, CACHE)
69CLEARPCGFLAG(Cache, CACHE)
70SETPCGFLAG(Cache, CACHE)
71
72TESTPCGFLAG(Used, USED) 63TESTPCGFLAG(Used, USED)
73CLEARPCGFLAG(Used, USED) 64CLEARPCGFLAG(Used, USED)
74SETPCGFLAG(Used, USED) 65SETPCGFLAG(Used, USED)
75 66
76SETPCGFLAG(FileMapped, FILE_MAPPED)
77CLEARPCGFLAG(FileMapped, FILE_MAPPED)
78TESTPCGFLAG(FileMapped, FILE_MAPPED)
79
80SETPCGFLAG(Migration, MIGRATION) 67SETPCGFLAG(Migration, MIGRATION)
81CLEARPCGFLAG(Migration, MIGRATION) 68CLEARPCGFLAG(Migration, MIGRATION)
82TESTPCGFLAG(Migration, MIGRATION) 69TESTPCGFLAG(Migration, MIGRATION)
@@ -85,7 +72,7 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
85{ 72{
86 /* 73 /*
87 * Don't take this lock in IRQ context. 74 * Don't take this lock in IRQ context.
88 * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION 75 * This lock is for pc->mem_cgroup, USED, MIGRATION
89 */ 76 */
90 bit_spin_lock(PCG_LOCK, &pc->flags); 77 bit_spin_lock(PCG_LOCK, &pc->flags);
91} 78}
@@ -95,24 +82,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
95 bit_spin_unlock(PCG_LOCK, &pc->flags); 82 bit_spin_unlock(PCG_LOCK, &pc->flags);
96} 83}
97 84
98static inline void move_lock_page_cgroup(struct page_cgroup *pc,
99 unsigned long *flags)
100{
101 /*
102 * We know updates to pc->flags of page cache's stats are from both of
103 * usual context or IRQ context. Disable IRQ to avoid deadlock.
104 */
105 local_irq_save(*flags);
106 bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
107}
108
109static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
110 unsigned long *flags)
111{
112 bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
113 local_irq_restore(*flags);
114}
115
116#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 85#else /* CONFIG_CGROUP_MEM_RES_CTLR */
117struct page_cgroup; 86struct page_cgroup;
118 87
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1cdd62a2788a..fd07c4542cee 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -122,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_struct *);
122int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); 122int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
123void anon_vma_moveto_tail(struct vm_area_struct *); 123void anon_vma_moveto_tail(struct vm_area_struct *);
124int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); 124int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
125void __anon_vma_link(struct vm_area_struct *);
126 125
127static inline void anon_vma_merge(struct vm_area_struct *vma, 126static inline void anon_vma_merge(struct vm_area_struct *vma,
128 struct vm_area_struct *next) 127 struct vm_area_struct *next)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e074e1e54f85..0c147a4260a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1514,7 +1514,7 @@ struct task_struct {
1514#endif 1514#endif
1515#ifdef CONFIG_CPUSETS 1515#ifdef CONFIG_CPUSETS
1516 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1516 nodemask_t mems_allowed; /* Protected by alloc_lock */
1517 int mems_allowed_change_disable; 1517 seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
1518 int cpuset_mem_spread_rotor; 1518 int cpuset_mem_spread_rotor;
1519 int cpuset_slab_spread_rotor; 1519 int cpuset_slab_spread_rotor;
1520#endif 1520#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3e60228e7299..b86b5c20617d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone,
223extern void activate_page(struct page *); 223extern void activate_page(struct page *);
224extern void mark_page_accessed(struct page *); 224extern void mark_page_accessed(struct page *);
225extern void lru_add_drain(void); 225extern void lru_add_drain(void);
226extern void lru_add_drain_cpu(int cpu);
226extern int lru_add_drain_all(void); 227extern int lru_add_drain_all(void);
227extern void rotate_reclaimable_page(struct page *page); 228extern void rotate_reclaimable_page(struct page *page);
228extern void deactivate_page(struct page *page); 229extern void deactivate_page(struct page *page);
@@ -329,7 +330,6 @@ extern long total_swap_pages;
329extern void si_swapinfo(struct sysinfo *); 330extern void si_swapinfo(struct sysinfo *);
330extern swp_entry_t get_swap_page(void); 331extern swp_entry_t get_swap_page(void);
331extern swp_entry_t get_swap_page_of_type(int); 332extern swp_entry_t get_swap_page_of_type(int);
332extern int valid_swaphandles(swp_entry_t, unsigned long *);
333extern int add_swap_count_continuation(swp_entry_t, gfp_t); 333extern int add_swap_count_continuation(swp_entry_t, gfp_t);
334extern void swap_shmem_alloc(swp_entry_t); 334extern void swap_shmem_alloc(swp_entry_t);
335extern int swap_duplicate(swp_entry_t); 335extern int swap_duplicate(swp_entry_t);
diff --git a/ipc/shm.c b/ipc/shm.c
index b76be5bda6c2..406c5b208193 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -482,7 +482,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
482 /* hugetlb_file_setup applies strict accounting */ 482 /* hugetlb_file_setup applies strict accounting */
483 if (shmflg & SHM_NORESERVE) 483 if (shmflg & SHM_NORESERVE)
484 acctflag = VM_NORESERVE; 484 acctflag = VM_NORESERVE;
485 file = hugetlb_file_setup(name, size, acctflag, 485 file = hugetlb_file_setup(name, 0, size, acctflag,
486 &shp->mlock_user, HUGETLB_SHMFS_INODE); 486 &shp->mlock_user, HUGETLB_SHMFS_INODE);
487 } else { 487 } else {
488 /* 488 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1ece8e20fdb5..f4ea4b6f3cf1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4881,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4881 4881
4882 rcu_assign_pointer(id->css, NULL); 4882 rcu_assign_pointer(id->css, NULL);
4883 rcu_assign_pointer(css->id, NULL); 4883 rcu_assign_pointer(css->id, NULL);
4884 write_lock(&ss->id_lock); 4884 spin_lock(&ss->id_lock);
4885 idr_remove(&ss->idr, id->id); 4885 idr_remove(&ss->idr, id->id);
4886 write_unlock(&ss->id_lock); 4886 spin_unlock(&ss->id_lock);
4887 kfree_rcu(id, rcu_head); 4887 kfree_rcu(id, rcu_head);
4888} 4888}
4889EXPORT_SYMBOL_GPL(free_css_id); 4889EXPORT_SYMBOL_GPL(free_css_id);
@@ -4909,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4909 error = -ENOMEM; 4909 error = -ENOMEM;
4910 goto err_out; 4910 goto err_out;
4911 } 4911 }
4912 write_lock(&ss->id_lock); 4912 spin_lock(&ss->id_lock);
4913 /* Don't use 0. allocates an ID of 1-65535 */ 4913 /* Don't use 0. allocates an ID of 1-65535 */
4914 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4914 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4915 write_unlock(&ss->id_lock); 4915 spin_unlock(&ss->id_lock);
4916 4916
4917 /* Returns error when there are no free spaces for new ID.*/ 4917 /* Returns error when there are no free spaces for new ID.*/
4918 if (error) { 4918 if (error) {
@@ -4927,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4927 return newid; 4927 return newid;
4928remove_idr: 4928remove_idr:
4929 error = -ENOSPC; 4929 error = -ENOSPC;
4930 write_lock(&ss->id_lock); 4930 spin_lock(&ss->id_lock);
4931 idr_remove(&ss->idr, myid); 4931 idr_remove(&ss->idr, myid);
4932 write_unlock(&ss->id_lock); 4932 spin_unlock(&ss->id_lock);
4933err_out: 4933err_out:
4934 kfree(newid); 4934 kfree(newid);
4935 return ERR_PTR(error); 4935 return ERR_PTR(error);
@@ -4941,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4941{ 4941{
4942 struct css_id *newid; 4942 struct css_id *newid;
4943 4943
4944 rwlock_init(&ss->id_lock); 4944 spin_lock_init(&ss->id_lock);
4945 idr_init(&ss->idr); 4945 idr_init(&ss->idr);
4946 4946
4947 newid = get_new_cssid(ss, 0); 4947 newid = get_new_cssid(ss, 0);
@@ -5029,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
5029 return NULL; 5029 return NULL;
5030 5030
5031 BUG_ON(!ss->use_id); 5031 BUG_ON(!ss->use_id);
5032 WARN_ON_ONCE(!rcu_read_lock_held());
5033
5032 /* fill start point for scan */ 5034 /* fill start point for scan */
5033 tmpid = id; 5035 tmpid = id;
5034 while (1) { 5036 while (1) {
@@ -5036,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
5036 * scan next entry from bitmap(tree), tmpid is updated after 5038 * scan next entry from bitmap(tree), tmpid is updated after
5037 * idr_get_next(). 5039 * idr_get_next().
5038 */ 5040 */
5039 read_lock(&ss->id_lock);
5040 tmp = idr_get_next(&ss->idr, &tmpid); 5041 tmp = idr_get_next(&ss->idr, &tmpid);
5041 read_unlock(&ss->id_lock);
5042
5043 if (!tmp) 5042 if (!tmp)
5044 break; 5043 break;
5045 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5044 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5d575836dba6..1010cc61931f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
964{ 964{
965 bool need_loop; 965 bool need_loop;
966 966
967repeat:
968 /* 967 /*
969 * Allow tasks that have access to memory reserves because they have 968 * Allow tasks that have access to memory reserves because they have
970 * been OOM killed to get memory anywhere. 969 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
983 */ 982 */
984 need_loop = task_has_mempolicy(tsk) || 983 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed); 984 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988 985
989 /* 986 if (need_loop)
990 * ensure checking ->mems_allowed_change_disable after setting all new 987 write_seqcount_begin(&tsk->mems_allowed_seq);
991 * allowed nodes.
992 *
993 * the read-side task can see an nodemask with new allowed nodes and
994 * old allowed nodes. and if it allocates page when cpuset clears newly
995 * disallowed ones continuous, it can see the new allowed bits.
996 *
997 * And if setting all new allowed nodes is after the checking, setting
998 * all new allowed nodes and clearing newly disallowed ones will be done
999 * continuous, and the read-side task may find no node to alloc page.
1000 */
1001 smp_mb();
1002 988
1003 /* 989 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1004 * Allocation of memory is very fast, we needn't sleep when waiting 990 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1005 * for the read-side.
1006 */
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013
1014 /*
1015 * ensure checking ->mems_allowed_change_disable before clearing all new
1016 * disallowed nodes.
1017 *
1018 * if clearing newly disallowed bits before the checking, the read-side
1019 * task may find no node to alloc page.
1020 */
1021 smp_mb();
1022 991
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 992 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems; 993 tsk->mems_allowed = *newmems;
994
995 if (need_loop)
996 write_seqcount_end(&tsk->mems_allowed_seq);
997
1025 task_unlock(tsk); 998 task_unlock(tsk);
1026} 999}
1027 1000
diff --git a/kernel/exit.c b/kernel/exit.c
index 7ad335c3045a..16b07bfac224 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -935,7 +935,7 @@ void do_exit(long code)
935 acct_update_integrals(tsk); 935 acct_update_integrals(tsk);
936 /* sync mm's RSS info before statistics gathering */ 936 /* sync mm's RSS info before statistics gathering */
937 if (tsk->mm) 937 if (tsk->mm)
938 sync_mm_rss(tsk, tsk->mm); 938 sync_mm_rss(tsk->mm);
939 group_dead = atomic_dec_and_test(&tsk->signal->live); 939 group_dead = atomic_dec_and_test(&tsk->signal->live);
940 if (group_dead) { 940 if (group_dead) {
941 hrtimer_cancel(&tsk->signal->real_timer); 941 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index 26a7138bb849..37674ec55cde 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -512,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
512 return NULL; 512 return NULL;
513} 513}
514 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
515/* 532/*
516 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
517 */ 534 */
@@ -539,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
539 mm_free_pgd(mm); 556 mm_free_pgd(mm);
540 destroy_context(mm); 557 destroy_context(mm);
541 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
542#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
543 VM_BUG_ON(mm->pmd_huge_pte);
544#endif
545 free_mm(mm); 560 free_mm(mm);
546} 561}
547EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1223,6 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1223#ifdef CONFIG_CPUSETS 1238#ifdef CONFIG_CPUSETS
1224 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1239 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1225 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1240 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1241 seqcount_init(&p->mems_allowed_seq);
1226#endif 1242#endif
1227#ifdef CONFIG_TRACE_IRQFLAGS 1243#ifdef CONFIG_TRACE_IRQFLAGS
1228 p->irq_events = 0; 1244 p->irq_events = 0;
diff --git a/lib/idr.c b/lib/idr.c
index ed055b297c81..12499ba7967e 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -595,8 +595,10 @@ EXPORT_SYMBOL(idr_for_each);
595 * Returns pointer to registered object with id, which is next number to 595 * Returns pointer to registered object with id, which is next number to
596 * given id. After being looked up, *@nextidp will be updated for the next 596 * given id. After being looked up, *@nextidp will be updated for the next
597 * iteration. 597 * iteration.
598 *
599 * This function can be called under rcu_read_lock(), given that the leaf
600 * pointers' lifetimes are correctly managed.
598 */ 601 */
599
600void *idr_get_next(struct idr *idp, int *nextidp) 602void *idr_get_next(struct idr *idp, int *nextidp)
601{ 603{
602 struct idr_layer *p, *pa[MAX_LEVEL]; 604 struct idr_layer *p, *pa[MAX_LEVEL];
@@ -605,11 +607,11 @@ void *idr_get_next(struct idr *idp, int *nextidp)
605 int n, max; 607 int n, max;
606 608
607 /* find first ent */ 609 /* find first ent */
608 n = idp->layers * IDR_BITS;
609 max = 1 << n;
610 p = rcu_dereference_raw(idp->top); 610 p = rcu_dereference_raw(idp->top);
611 if (!p) 611 if (!p)
612 return NULL; 612 return NULL;
613 n = (p->layer + 1) * IDR_BITS;
614 max = 1 << n;
613 615
614 while (id < max) { 616 while (id < max) {
615 while (n > 0 && p) { 617 while (n > 0 && p) {
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf2..0131170c9d54 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr) 766 unsigned long section_nr)
767{ 767{
768 bootmem_data_t *bdata; 768 bootmem_data_t *bdata;
769 unsigned long pfn, goal, limit; 769 unsigned long pfn, goal;
770 770
771 pfn = section_nr_to_pfn(section_nr); 771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT; 772 goal = pfn << PAGE_SHIFT;
773 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
774 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
775 774
776 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
777} 776}
778#endif 777#endif
779 778
diff --git a/mm/compaction.c b/mm/compaction.c
index d9ebebe1a2aa..74a8c825ff28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 unsigned int order; /* order a direct compactor needs */ 38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 40 struct zone *zone;
41}; 41};
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
675 675
676 676
677/* Compact all zones within a node */ 677/* Compact all zones within a node */
678static int compact_node(int nid) 678static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
679{ 679{
680 int zoneid; 680 int zoneid;
681 pg_data_t *pgdat;
682 struct zone *zone; 681 struct zone *zone;
683 682
684 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
685 return -EINVAL;
686 pgdat = NODE_DATA(nid);
687
688 /* Flush pending updates to the LRU lists */
689 lru_add_drain_all();
690
691 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 683 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
692 struct compact_control cc = {
693 .nr_freepages = 0,
694 .nr_migratepages = 0,
695 .order = -1,
696 .sync = true,
697 };
698 684
699 zone = &pgdat->node_zones[zoneid]; 685 zone = &pgdat->node_zones[zoneid];
700 if (!populated_zone(zone)) 686 if (!populated_zone(zone))
701 continue; 687 continue;
702 688
703 cc.zone = zone; 689 cc->nr_freepages = 0;
704 INIT_LIST_HEAD(&cc.freepages); 690 cc->nr_migratepages = 0;
705 INIT_LIST_HEAD(&cc.migratepages); 691 cc->zone = zone;
706 692 INIT_LIST_HEAD(&cc->freepages);
707 compact_zone(zone, &cc); 693 INIT_LIST_HEAD(&cc->migratepages);
694
695 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
696 compact_zone(zone, cc);
697
698 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync)
705 defer_compaction(zone, cc->order);
706 }
708 707
709 VM_BUG_ON(!list_empty(&cc.freepages)); 708 VM_BUG_ON(!list_empty(&cc->freepages));
710 VM_BUG_ON(!list_empty(&cc.migratepages)); 709 VM_BUG_ON(!list_empty(&cc->migratepages));
711 } 710 }
712 711
713 return 0; 712 return 0;
714} 713}
715 714
715int compact_pgdat(pg_data_t *pgdat, int order)
716{
717 struct compact_control cc = {
718 .order = order,
719 .sync = false,
720 };
721
722 return __compact_pgdat(pgdat, &cc);
723}
724
725static int compact_node(int nid)
726{
727 struct compact_control cc = {
728 .order = -1,
729 .sync = true,
730 };
731
732 return __compact_pgdat(NODE_DATA(nid), &cc);
733}
734
716/* Compact all nodes in the system */ 735/* Compact all nodes in the system */
717static int compact_nodes(void) 736static int compact_nodes(void)
718{ 737{
719 int nid; 738 int nid;
720 739
740 /* Flush pending updates to the LRU lists */
741 lru_add_drain_all();
742
721 for_each_online_node(nid) 743 for_each_online_node(nid)
722 compact_node(nid); 744 compact_node(nid);
723 745
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
750 struct device_attribute *attr, 772 struct device_attribute *attr,
751 const char *buf, size_t count) 773 const char *buf, size_t count)
752{ 774{
753 compact_node(dev->id); 775 int nid = dev->id;
776
777 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
778 /* Flush pending updates to the LRU lists */
779 lru_add_drain_all();
780
781 compact_node(nid);
782 }
754 783
755 return count; 784 return count;
756} 785}
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f8165075a5a..843042045dc9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
101 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
103 * 103 *
104 * (code doesn't rely on that order, so you could switch it around) 104 * ->i_mmap_mutex
105 * ->tasklist_lock (memory_failure, collect_procs_ao) 105 * ->tasklist_lock (memory_failure, collect_procs_ao)
106 * ->i_mmap_mutex
107 */ 106 */
108 107
109/* 108/*
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
500 struct page *page; 499 struct page *page;
501 500
502 if (cpuset_do_page_mem_spread()) { 501 if (cpuset_do_page_mem_spread()) {
503 get_mems_allowed(); 502 unsigned int cpuset_mems_cookie;
504 n = cpuset_mem_spread_node(); 503 do {
505 page = alloc_pages_exact_node(n, gfp, 0); 504 cpuset_mems_cookie = get_mems_allowed();
506 put_mems_allowed(); 505 n = cpuset_mem_spread_node();
506 page = alloc_pages_exact_node(n, gfp, 0);
507 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
508
507 return page; 509 return page;
508 } 510 }
509 return alloc_pages(gfp, 0); 511 return alloc_pages(gfp, 0);
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2341 struct page *page; 2343 struct page *page;
2342 gfp_t gfp_notmask = 0; 2344 gfp_t gfp_notmask = 0;
2343 2345
2344 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; 2346 gfp_mask = mapping_gfp_mask(mapping);
2347 if (mapping_cap_account_dirty(mapping))
2348 gfp_mask |= __GFP_WRITE;
2345 if (flags & AOP_FLAG_NOFS) 2349 if (flags & AOP_FLAG_NOFS)
2346 gfp_notmask = __GFP_FS; 2350 gfp_notmask = __GFP_FS;
2347repeat: 2351repeat:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8f7fc394f636..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1031{ 1031{
1032 int ret = 0; 1032 int ret = 0;
1033 1033
1034 spin_lock(&tlb->mm->page_table_lock); 1034 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1035 if (likely(pmd_trans_huge(*pmd))) { 1035 struct page *page;
1036 if (unlikely(pmd_trans_splitting(*pmd))) { 1036 pgtable_t pgtable;
1037 spin_unlock(&tlb->mm->page_table_lock); 1037 pgtable = get_pmd_huge_pte(tlb->mm);
1038 wait_split_huge_page(vma->anon_vma, 1038 page = pmd_page(*pmd);
1039 pmd); 1039 pmd_clear(pmd);
1040 } else { 1040 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1041 struct page *page; 1041 page_remove_rmap(page);
1042 pgtable_t pgtable; 1042 VM_BUG_ON(page_mapcount(page) < 0);
1043 pgtable = get_pmd_huge_pte(tlb->mm); 1043 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1044 page = pmd_page(*pmd); 1044 VM_BUG_ON(!PageHead(page));
1045 pmd_clear(pmd); 1045 tlb->mm->nr_ptes--;
1046 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1047 page_remove_rmap(page);
1048 VM_BUG_ON(page_mapcount(page) < 0);
1049 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1050 VM_BUG_ON(!PageHead(page));
1051 tlb->mm->nr_ptes--;
1052 spin_unlock(&tlb->mm->page_table_lock);
1053 tlb_remove_page(tlb, page);
1054 pte_free(tlb->mm, pgtable);
1055 ret = 1;
1056 }
1057 } else
1058 spin_unlock(&tlb->mm->page_table_lock); 1046 spin_unlock(&tlb->mm->page_table_lock);
1059 1047 tlb_remove_page(tlb, page);
1048 pte_free(tlb->mm, pgtable);
1049 ret = 1;
1050 }
1060 return ret; 1051 return ret;
1061} 1052}
1062 1053
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1066{ 1057{
1067 int ret = 0; 1058 int ret = 0;
1068 1059
1069 spin_lock(&vma->vm_mm->page_table_lock); 1060 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1070 if (likely(pmd_trans_huge(*pmd))) { 1061 /*
1071 ret = !pmd_trans_splitting(*pmd); 1062 * All logical pages in the range are present
1072 spin_unlock(&vma->vm_mm->page_table_lock); 1063 * if backed by a huge page.
1073 if (unlikely(!ret)) 1064 */
1074 wait_split_huge_page(vma->anon_vma, pmd);
1075 else {
1076 /*
1077 * All logical pages in the range are present
1078 * if backed by a huge page.
1079 */
1080 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1081 }
1082 } else
1083 spin_unlock(&vma->vm_mm->page_table_lock); 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1066 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1067 ret = 1;
1068 }
1084 1069
1085 return ret; 1070 return ret;
1086} 1071}
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1110 goto out; 1095 goto out;
1111 } 1096 }
1112 1097
1113 spin_lock(&mm->page_table_lock); 1098 ret = __pmd_trans_huge_lock(old_pmd, vma);
1114 if (likely(pmd_trans_huge(*old_pmd))) { 1099 if (ret == 1) {
1115 if (pmd_trans_splitting(*old_pmd)) { 1100 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1116 spin_unlock(&mm->page_table_lock); 1101 VM_BUG_ON(!pmd_none(*new_pmd));
1117 wait_split_huge_page(vma->anon_vma, old_pmd); 1102 set_pmd_at(mm, new_addr, new_pmd, pmd);
1118 ret = -1;
1119 } else {
1120 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1121 VM_BUG_ON(!pmd_none(*new_pmd));
1122 set_pmd_at(mm, new_addr, new_pmd, pmd);
1123 spin_unlock(&mm->page_table_lock);
1124 ret = 1;
1125 }
1126 } else {
1127 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(&mm->page_table_lock);
1128 } 1104 }
1129out: 1105out:
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1136 struct mm_struct *mm = vma->vm_mm; 1112 struct mm_struct *mm = vma->vm_mm;
1137 int ret = 0; 1113 int ret = 0;
1138 1114
1139 spin_lock(&mm->page_table_lock); 1115 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1116 pmd_t entry;
1117 entry = pmdp_get_and_clear(mm, addr, pmd);
1118 entry = pmd_modify(entry, newprot);
1119 set_pmd_at(mm, addr, pmd, entry);
1120 spin_unlock(&vma->vm_mm->page_table_lock);
1121 ret = 1;
1122 }
1123
1124 return ret;
1125}
1126
1127/*
1128 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1129 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1130 *
1131 * Note that if it returns 1, this routine returns without unlocking page
1132 * table locks. So callers must unlock them.
1133 */
1134int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1135{
1136 spin_lock(&vma->vm_mm->page_table_lock);
1140 if (likely(pmd_trans_huge(*pmd))) { 1137 if (likely(pmd_trans_huge(*pmd))) {
1141 if (unlikely(pmd_trans_splitting(*pmd))) { 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1142 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&vma->vm_mm->page_table_lock);
1143 wait_split_huge_page(vma->anon_vma, pmd); 1140 wait_split_huge_page(vma->anon_vma, pmd);
1141 return -1;
1144 } else { 1142 } else {
1145 pmd_t entry; 1143 /* Thp mapped by 'pmd' is stable, so we can
1146 1144 * handle it as it is. */
1147 entry = pmdp_get_and_clear(mm, addr, pmd); 1145 return 1;
1148 entry = pmd_modify(entry, newprot);
1149 set_pmd_at(mm, addr, pmd, entry);
1150 spin_unlock(&vma->vm_mm->page_table_lock);
1151 ret = 1;
1152 } 1146 }
1153 } else 1147 }
1154 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1155 1149 return 0;
1156 return ret;
1157} 1150}
1158 1151
1159pmd_t *page_check_address_pmd(struct page *page, 1152pmd_t *page_check_address_pmd(struct page *page,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a876871f6be5..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54static DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62 /* If no pages are used, and no other handles to the subpool
63 * remain, free the subpool */
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119 /* If hugetlbfs_put_super couldn't free spool due to
120 * an outstanding quota reference, free it now. */
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
56/* 134/*
57 * Region tracking -- allows tracking of reservations and instantiated pages 135 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping. 136 * across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
454 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
455 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
456{ 534{
457 struct page *page = NULL; 535 struct page *page;
458 struct mempolicy *mpol; 536 struct mempolicy *mpol;
459 nodemask_t *nodemask; 537 nodemask_t *nodemask;
460 struct zonelist *zonelist; 538 struct zonelist *zonelist;
461 struct zone *zone; 539 struct zone *zone;
462 struct zoneref *z; 540 struct zoneref *z;
541 unsigned int cpuset_mems_cookie;
463 542
464 get_mems_allowed(); 543retry_cpuset:
544 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 545 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 546 htlb_alloc_mask, &mpol, &nodemask);
467 /* 547 /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 568 }
489 } 569 }
490 } 570 }
491err: 571
492 mpol_cond_put(mpol); 572 mpol_cond_put(mpol);
493 put_mems_allowed(); 573 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
574 goto retry_cpuset;
494 return page; 575 return page;
576
577err:
578 mpol_cond_put(mpol);
579 return NULL;
495} 580}
496 581
497static void update_and_free_page(struct hstate *h, struct page *page) 582static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
533 */ 618 */
534 struct hstate *h = page_hstate(page); 619 struct hstate *h = page_hstate(page);
535 int nid = page_to_nid(page); 620 int nid = page_to_nid(page);
536 struct address_space *mapping; 621 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page);
537 623
538 mapping = (struct address_space *) page_private(page);
539 set_page_private(page, 0); 624 set_page_private(page, 0);
540 page->mapping = NULL; 625 page->mapping = NULL;
541 BUG_ON(page_count(page)); 626 BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
551 enqueue_huge_page(h, page); 636 enqueue_huge_page(h, page);
552 } 637 }
553 spin_unlock(&hugetlb_lock); 638 spin_unlock(&hugetlb_lock);
554 if (mapping) 639 hugepage_subpool_put_pages(spool, 1);
555 hugetlb_put_quota(mapping, 1);
556} 640}
557 641
558static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
852 struct page *page, *tmp; 936 struct page *page, *tmp;
853 int ret, i; 937 int ret, i;
854 int needed, allocated; 938 int needed, allocated;
939 bool alloc_ok = true;
855 940
856 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 941 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
857 if (needed <= 0) { 942 if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
867 spin_unlock(&hugetlb_lock); 952 spin_unlock(&hugetlb_lock);
868 for (i = 0; i < needed; i++) { 953 for (i = 0; i < needed; i++) {
869 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 954 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
870 if (!page) 955 if (!page) {
871 /* 956 alloc_ok = false;
872 * We were not able to allocate enough pages to 957 break;
873 * satisfy the entire reservation so we free what 958 }
874 * we've allocated so far.
875 */
876 goto free;
877
878 list_add(&page->lru, &surplus_list); 959 list_add(&page->lru, &surplus_list);
879 } 960 }
880 allocated += needed; 961 allocated += i;
881 962
882 /* 963 /*
883 * After retaking hugetlb_lock, we need to recalculate 'needed' 964 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
886 spin_lock(&hugetlb_lock); 967 spin_lock(&hugetlb_lock);
887 needed = (h->resv_huge_pages + delta) - 968 needed = (h->resv_huge_pages + delta) -
888 (h->free_huge_pages + allocated); 969 (h->free_huge_pages + allocated);
889 if (needed > 0) 970 if (needed > 0) {
890 goto retry; 971 if (alloc_ok)
891 972 goto retry;
973 /*
974 * We were not able to allocate enough pages to
975 * satisfy the entire reservation so we free what
976 * we've allocated so far.
977 */
978 goto free;
979 }
892 /* 980 /*
893 * The surplus_list now contains _at_least_ the number of extra pages 981 * The surplus_list now contains _at_least_ the number of extra pages
894 * needed to accommodate the reservation. Add the appropriate number 982 * needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
914 VM_BUG_ON(page_count(page)); 1002 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 1003 enqueue_huge_page(h, page);
916 } 1004 }
1005free:
917 spin_unlock(&hugetlb_lock); 1006 spin_unlock(&hugetlb_lock);
918 1007
919 /* Free unnecessary surplus pages to the buddy allocator */ 1008 /* Free unnecessary surplus pages to the buddy allocator */
920free:
921 if (!list_empty(&surplus_list)) { 1009 if (!list_empty(&surplus_list)) {
922 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
923 list_del(&page->lru); 1011 list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
966/* 1054/*
967 * Determine if the huge page at addr within the vma has an associated 1055 * Determine if the huge page at addr within the vma has an associated
968 * reservation. Where it does not we will need to logically increase 1056 * reservation. Where it does not we will need to logically increase
969 * reservation and actually increase quota before an allocation can occur. 1057 * reservation and actually increase subpool usage before an allocation
970 * Where any new reservation would be required the reservation change is 1058 * can occur. Where any new reservation would be required the
971 * prepared, but not committed. Once the page has been quota'd allocated 1059 * reservation change is prepared, but not committed. Once the page
972 * an instantiated the change should be committed via vma_commit_reservation. 1060 * has been allocated from the subpool and instantiated the change should
973 * No action is required on failure. 1061 * be committed via vma_commit_reservation. No action is required on
1062 * failure.
974 */ 1063 */
975static long vma_needs_reservation(struct hstate *h, 1064static long vma_needs_reservation(struct hstate *h,
976 struct vm_area_struct *vma, unsigned long addr) 1065 struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
1019static struct page *alloc_huge_page(struct vm_area_struct *vma, 1108static struct page *alloc_huge_page(struct vm_area_struct *vma,
1020 unsigned long addr, int avoid_reserve) 1109 unsigned long addr, int avoid_reserve)
1021{ 1110{
1111 struct hugepage_subpool *spool = subpool_vma(vma);
1022 struct hstate *h = hstate_vma(vma); 1112 struct hstate *h = hstate_vma(vma);
1023 struct page *page; 1113 struct page *page;
1024 struct address_space *mapping = vma->vm_file->f_mapping;
1025 struct inode *inode = mapping->host;
1026 long chg; 1114 long chg;
1027 1115
1028 /* 1116 /*
1029 * Processes that did not create the mapping will have no reserves and 1117 * Processes that did not create the mapping will have no
1030 * will not have accounted against quota. Check that the quota can be 1118 * reserves and will not have accounted against subpool
1031 * made before satisfying the allocation 1119 * limit. Check that the subpool limit can be made before
1032 * MAP_NORESERVE mappings may also need pages and quota allocated 1120 * satisfying the allocation. MAP_NORESERVE mappings may also
1033 * if no reserve mapping overlaps. 1121 * need pages and subpool limit allocated if no reserve
1122 * mapping overlaps.
1034 */ 1123 */
1035 chg = vma_needs_reservation(h, vma, addr); 1124 chg = vma_needs_reservation(h, vma, addr);
1036 if (chg < 0) 1125 if (chg < 0)
1037 return ERR_PTR(-VM_FAULT_OOM); 1126 return ERR_PTR(-VM_FAULT_OOM);
1038 if (chg) 1127 if (chg)
1039 if (hugetlb_get_quota(inode->i_mapping, chg)) 1128 if (hugepage_subpool_get_pages(spool, chg))
1040 return ERR_PTR(-VM_FAULT_SIGBUS); 1129 return ERR_PTR(-VM_FAULT_SIGBUS);
1041 1130
1042 spin_lock(&hugetlb_lock); 1131 spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1046 if (!page) { 1135 if (!page) {
1047 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1048 if (!page) { 1137 if (!page) {
1049 hugetlb_put_quota(inode->i_mapping, chg); 1138 hugepage_subpool_put_pages(spool, chg);
1050 return ERR_PTR(-VM_FAULT_SIGBUS); 1139 return ERR_PTR(-VM_FAULT_SIGBUS);
1051 } 1140 }
1052 } 1141 }
1053 1142
1054 set_page_private(page, (unsigned long) mapping); 1143 set_page_private(page, (unsigned long)spool);
1055 1144
1056 vma_commit_reservation(h, vma, addr); 1145 vma_commit_reservation(h, vma, addr);
1057 1146
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2072{ 2161{
2073 struct hstate *h = hstate_vma(vma); 2162 struct hstate *h = hstate_vma(vma);
2074 struct resv_map *reservations = vma_resv_map(vma); 2163 struct resv_map *reservations = vma_resv_map(vma);
2164 struct hugepage_subpool *spool = subpool_vma(vma);
2075 unsigned long reserve; 2165 unsigned long reserve;
2076 unsigned long start; 2166 unsigned long start;
2077 unsigned long end; 2167 unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2087 2177
2088 if (reserve) { 2178 if (reserve) {
2089 hugetlb_acct_memory(h, -reserve); 2179 hugetlb_acct_memory(h, -reserve);
2090 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2180 hugepage_subpool_put_pages(spool, reserve);
2091 } 2181 }
2092 } 2182 }
2093} 2183}
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2276 if (pte_dirty(pte)) 2366 if (pte_dirty(pte))
2277 set_page_dirty(page); 2367 set_page_dirty(page);
2278 list_add(&page->lru, &page_list); 2368 list_add(&page->lru, &page_list);
2369
2370 /* Bail out after unmapping reference page if supplied */
2371 if (ref_page)
2372 break;
2279 } 2373 }
2280 flush_tlb_range(vma, start, end); 2374 flush_tlb_range(vma, start, end);
2281 spin_unlock(&mm->page_table_lock); 2375 spin_unlock(&mm->page_table_lock);
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2316 */ 2410 */
2317 address = address & huge_page_mask(h); 2411 address = address & huge_page_mask(h);
2318 pgoff = vma_hugecache_offset(h, vma, address); 2412 pgoff = vma_hugecache_offset(h, vma, address);
2319 mapping = (struct address_space *)page_private(page); 2413 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2320 2414
2321 /* 2415 /*
2322 * Take the mapping lock for the duration of the table walk. As 2416 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
2869{ 2963{
2870 long ret, chg; 2964 long ret, chg;
2871 struct hstate *h = hstate_inode(inode); 2965 struct hstate *h = hstate_inode(inode);
2966 struct hugepage_subpool *spool = subpool_inode(inode);
2872 2967
2873 /* 2968 /*
2874 * Only apply hugepage reservation if asked. At fault time, an 2969 * Only apply hugepage reservation if asked. At fault time, an
2875 * attempt will be made for VM_NORESERVE to allocate a page 2970 * attempt will be made for VM_NORESERVE to allocate a page
2876 * and filesystem quota without using reserves 2971 * without using reserves
2877 */ 2972 */
2878 if (vm_flags & VM_NORESERVE) 2973 if (vm_flags & VM_NORESERVE)
2879 return 0; 2974 return 0;
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
2900 if (chg < 0) 2995 if (chg < 0)
2901 return chg; 2996 return chg;
2902 2997
2903 /* There must be enough filesystem quota for the mapping */ 2998 /* There must be enough pages in the subpool for the mapping */
2904 if (hugetlb_get_quota(inode->i_mapping, chg)) 2999 if (hugepage_subpool_get_pages(spool, chg))
2905 return -ENOSPC; 3000 return -ENOSPC;
2906 3001
2907 /* 3002 /*
2908 * Check enough hugepages are available for the reservation. 3003 * Check enough hugepages are available for the reservation.
2909 * Hand back the quota if there are not 3004 * Hand the pages back to the subpool if there are not
2910 */ 3005 */
2911 ret = hugetlb_acct_memory(h, chg); 3006 ret = hugetlb_acct_memory(h, chg);
2912 if (ret < 0) { 3007 if (ret < 0) {
2913 hugetlb_put_quota(inode->i_mapping, chg); 3008 hugepage_subpool_put_pages(spool, chg);
2914 return ret; 3009 return ret;
2915 } 3010 }
2916 3011
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2934{ 3029{
2935 struct hstate *h = hstate_inode(inode); 3030 struct hstate *h = hstate_inode(inode);
2936 long chg = region_truncate(&inode->i_mapping->private_list, offset); 3031 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3032 struct hugepage_subpool *spool = subpool_inode(inode);
2937 3033
2938 spin_lock(&inode->i_lock); 3034 spin_lock(&inode->i_lock);
2939 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3035 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2940 spin_unlock(&inode->i_lock); 3036 spin_unlock(&inode->i_lock);
2941 3037
2942 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 3038 hugepage_subpool_put_pages(spool, (chg - freed));
2943 hugetlb_acct_memory(h, -(chg - freed)); 3039 hugetlb_acct_memory(h, -(chg - freed));
2944} 3040}
2945 3041
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d3fb7e6c10..47c885368890 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
375} 375}
376 376
377static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
378 unsigned long addr)
379{
380 struct vm_area_struct *vma;
381 if (ksm_test_exit(mm))
382 return NULL;
383 vma = find_vma(mm, addr);
384 if (!vma || vma->vm_start > addr)
385 return NULL;
386 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
387 return NULL;
388 return vma;
389}
390
377static void break_cow(struct rmap_item *rmap_item) 391static void break_cow(struct rmap_item *rmap_item)
378{ 392{
379 struct mm_struct *mm = rmap_item->mm; 393 struct mm_struct *mm = rmap_item->mm;
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
387 put_anon_vma(rmap_item->anon_vma); 401 put_anon_vma(rmap_item->anon_vma);
388 402
389 down_read(&mm->mmap_sem); 403 down_read(&mm->mmap_sem);
390 if (ksm_test_exit(mm)) 404 vma = find_mergeable_vma(mm, addr);
391 goto out; 405 if (vma)
392 vma = find_vma(mm, addr); 406 break_ksm(vma, addr);
393 if (!vma || vma->vm_start > addr)
394 goto out;
395 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
396 goto out;
397 break_ksm(vma, addr);
398out:
399 up_read(&mm->mmap_sem); 407 up_read(&mm->mmap_sem);
400} 408}
401 409
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
421 struct page *page; 429 struct page *page;
422 430
423 down_read(&mm->mmap_sem); 431 down_read(&mm->mmap_sem);
424 if (ksm_test_exit(mm)) 432 vma = find_mergeable_vma(mm, addr);
425 goto out; 433 if (!vma)
426 vma = find_vma(mm, addr);
427 if (!vma || vma->vm_start > addr)
428 goto out;
429 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
430 goto out; 434 goto out;
431 435
432 page = follow_page(vma, addr, FOLL_GET); 436 page = follow_page(vma, addr, FOLL_GET);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..b2ee6df0e9bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 MEM_CGROUP_STAT_NSTATS, 92 MEM_CGROUP_STAT_NSTATS,
94}; 93};
95 94
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
135 */ 134 */
136struct mem_cgroup_per_zone { 135struct mem_cgroup_per_zone {
137 struct lruvec lruvec; 136 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS]; 137 unsigned long lru_size[NR_LRU_LISTS];
139 138
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141 140
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
144 unsigned long long usage_in_excess;/* Set to the value by which */ 143 unsigned long long usage_in_excess;/* Set to the value by which */
145 /* the soft limit is exceeded*/ 144 /* the soft limit is exceeded*/
146 bool on_tree; 145 bool on_tree;
147 struct mem_cgroup *mem; /* Back pointer, we cannot */ 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */
148 /* use container_of */ 147 /* use container_of */
149}; 148};
150/* Macro for accessing counter */
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152 149
153struct mem_cgroup_per_node { 150struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -300,6 +297,12 @@ struct mem_cgroup {
300 */ 297 */
301 unsigned long move_charge_at_immigrate; 298 unsigned long move_charge_at_immigrate;
302 /* 299 /*
300 * set > 0 if pages under this cgroup are moving to other cgroup.
301 */
302 atomic_t moving_account;
303 /* taken only while moving_account > 0 */
304 spinlock_t move_lock;
305 /*
303 * percpu counter. 306 * percpu counter.
304 */ 307 */
305 struct mem_cgroup_stat_cpu *stat; 308 struct mem_cgroup_stat_cpu *stat;
@@ -612,9 +615,9 @@ retry:
612 * we will to add it back at the end of reclaim to its correct 615 * we will to add it back at the end of reclaim to its correct
613 * position in the tree. 616 * position in the tree.
614 */ 617 */
615 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
616 if (!res_counter_soft_limit_excess(&mz->mem->res) || 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
617 !css_tryget(&mz->mem->css)) 620 !css_tryget(&mz->memcg->css))
618 goto retry; 621 goto retry;
619done: 622done:
620 return mz; 623 return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
692} 695}
693 696
694static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
695 bool file, int nr_pages) 698 bool anon, int nr_pages)
696{ 699{
697 preempt_disable(); 700 preempt_disable();
698 701
699 if (file) 702 /*
700 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
704 * counted as CACHE even if it's on ANON LRU.
705 */
706 if (anon)
707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
701 nr_pages); 708 nr_pages);
702 else 709 else
703 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
704 nr_pages); 711 nr_pages);
705 712
706 /* pagein of a big page is an event. So, ignore page size */ 713 /* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
721 unsigned int lru_mask) 728 unsigned int lru_mask)
722{ 729{
723 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_per_zone *mz;
724 enum lru_list l; 731 enum lru_list lru;
725 unsigned long ret = 0; 732 unsigned long ret = 0;
726 733
727 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
728 735
729 for_each_lru(l) { 736 for_each_lru(lru) {
730 if (BIT(l) & lru_mask) 737 if (BIT(lru) & lru_mask)
731 ret += MEM_CGROUP_ZSTAT(mz, l); 738 ret += mz->lru_size[lru];
732 } 739 }
733 return ret; 740 return ret;
734} 741}
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1077 1084
1078 mz = page_cgroup_zoneinfo(memcg, page); 1085 mz = page_cgroup_zoneinfo(memcg, page);
1079 /* compound_order() is stabilized through lru_lock */ 1086 /* compound_order() is stabilized through lru_lock */
1080 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1087 mz->lru_size[lru] += 1 << compound_order(page);
1081 return &mz->lruvec; 1088 return &mz->lruvec;
1082} 1089}
1083 1090
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1105 VM_BUG_ON(!memcg); 1112 VM_BUG_ON(!memcg);
1106 mz = page_cgroup_zoneinfo(memcg, page); 1113 mz = page_cgroup_zoneinfo(memcg, page);
1107 /* huge page split is done under lru_lock. so, we have no races. */ 1114 /* huge page split is done under lru_lock. so, we have no races. */
1108 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1109 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 1116 mz->lru_size[lru] -= 1 << compound_order(page);
1110} 1117}
1111 1118
1112void mem_cgroup_lru_del(struct page *page) 1119void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1285 return memcg->swappiness; 1292 return memcg->swappiness;
1286} 1293}
1287 1294
1288static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1295/*
1289{ 1296 * memcg->moving_account is used for checking possibility that some thread is
1290 int cpu; 1297 * calling move_account(). When a thread on CPU-A starts moving pages under
1298 * a memcg, other threads should check memcg->moving_account under
1299 * rcu_read_lock(), like this:
1300 *
1301 * CPU-A CPU-B
1302 * rcu_read_lock()
1303 * memcg->moving_account+1 if (memcg->moving_account)
1304 * take heavy locks.
1305 * synchronize_rcu() update something.
1306 * rcu_read_unlock()
1307 * start move here.
1308 */
1291 1309
1292 get_online_cpus(); 1310/* for quick checking without looking up memcg */
1293 spin_lock(&memcg->pcp_counter_lock); 1311atomic_t memcg_moving __read_mostly;
1294 for_each_online_cpu(cpu)
1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1297 spin_unlock(&memcg->pcp_counter_lock);
1298 put_online_cpus();
1299 1312
1313static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1314{
1315 atomic_inc(&memcg_moving);
1316 atomic_inc(&memcg->moving_account);
1300 synchronize_rcu(); 1317 synchronize_rcu();
1301} 1318}
1302 1319
1303static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1320static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1304{ 1321{
1305 int cpu; 1322 /*
1306 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1307 if (!memcg) 1324 * We check NULL in callee rather than caller.
1308 return; 1325 */
1309 get_online_cpus(); 1326 if (memcg) {
1310 spin_lock(&memcg->pcp_counter_lock); 1327 atomic_dec(&memcg_moving);
1311 for_each_online_cpu(cpu) 1328 atomic_dec(&memcg->moving_account);
1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1329 }
1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1314 spin_unlock(&memcg->pcp_counter_lock);
1315 put_online_cpus();
1316} 1330}
1331
1317/* 1332/*
1318 * 2 routines for checking "mem" is under move_account() or not. 1333 * 2 routines for checking "mem" is under move_account() or not.
1319 * 1334 *
1320 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1321 * for avoiding race in accounting. If true, 1336 * is used for avoiding races in accounting. If true,
1322 * pc->mem_cgroup may be overwritten. 1337 * pc->mem_cgroup may be overwritten.
1323 * 1338 *
1324 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1326 * waiting at high-memory pressure caused by "move". 1341 * waiting at high-memory pressure caused by "move".
1327 */ 1342 */
1328 1343
1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg) 1344static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1330{ 1345{
1331 VM_BUG_ON(!rcu_read_lock_held()); 1346 VM_BUG_ON(!rcu_read_lock_held());
1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1347 return atomic_read(&memcg->moving_account) > 0;
1333} 1348}
1334 1349
1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1350static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1370 return false; 1385 return false;
1371} 1386}
1372 1387
1388/*
1389 * Take this lock when
1390 * - code tries to modify a page's memcg while it's USED.
1391 * - code tries to modify page state accounting in a memcg.
1392 * See mem_cgroup_stolen(), too.
1393 */
1394static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1395 unsigned long *flags)
1396{
1397 spin_lock_irqsave(&memcg->move_lock, *flags);
1398}
1399
1400static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1401 unsigned long *flags)
1402{
1403 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1404}
1405
1373/** 1406/**
1374 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1375 * @memcg: The memory cgroup that went over limit 1408 * @memcg: The memory cgroup that went over limit
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1393 if (!memcg || !p) 1426 if (!memcg || !p)
1394 return; 1427 return;
1395 1428
1396
1397 rcu_read_lock(); 1429 rcu_read_lock();
1398 1430
1399 mem_cgrp = memcg->css.cgroup; 1431 mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
1772static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1804static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1773 1805
1774struct oom_wait_info { 1806struct oom_wait_info {
1775 struct mem_cgroup *mem; 1807 struct mem_cgroup *memcg;
1776 wait_queue_t wait; 1808 wait_queue_t wait;
1777}; 1809};
1778 1810
1779static int memcg_oom_wake_function(wait_queue_t *wait, 1811static int memcg_oom_wake_function(wait_queue_t *wait,
1780 unsigned mode, int sync, void *arg) 1812 unsigned mode, int sync, void *arg)
1781{ 1813{
1782 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1783 *oom_wait_memcg; 1815 struct mem_cgroup *oom_wait_memcg;
1784 struct oom_wait_info *oom_wait_info; 1816 struct oom_wait_info *oom_wait_info;
1785 1817
1786 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1787 oom_wait_memcg = oom_wait_info->mem; 1819 oom_wait_memcg = oom_wait_info->memcg;
1788 1820
1789 /* 1821 /*
1790 * Both of oom_wait_info->mem and wake_mem are stable under us. 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
1791 * Then we can use css_is_ancestor without taking care of RCU. 1823 * Then we can use css_is_ancestor without taking care of RCU.
1792 */ 1824 */
1793 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1811/* 1843/*
1812 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1813 */ 1845 */
1814bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) 1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1815{ 1847{
1816 struct oom_wait_info owait; 1848 struct oom_wait_info owait;
1817 bool locked, need_to_kill; 1849 bool locked, need_to_kill;
1818 1850
1819 owait.mem = memcg; 1851 owait.memcg = memcg;
1820 owait.wait.flags = 0; 1852 owait.wait.flags = 0;
1821 owait.wait.func = memcg_oom_wake_function; 1853 owait.wait.func = memcg_oom_wake_function;
1822 owait.wait.private = current; 1854 owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1841 1873
1842 if (need_to_kill) { 1874 if (need_to_kill) {
1843 finish_wait(&memcg_oom_waitq, &owait.wait); 1875 finish_wait(&memcg_oom_waitq, &owait.wait);
1844 mem_cgroup_out_of_memory(memcg, mask); 1876 mem_cgroup_out_of_memory(memcg, mask, order);
1845 } else { 1877 } else {
1846 schedule(); 1878 schedule();
1847 finish_wait(&memcg_oom_waitq, &owait.wait); 1879 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1881 * by flags. 1913 * by flags.
1882 * 1914 *
1883 * Considering "move", this is an only case we see a race. To make the race 1915 * Considering "move", this is an only case we see a race. To make the race
1884 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1916 * small, we check memcg->moving_account and detect whether there is a
1885 * possibility of race condition. If there is, we take a lock. 1917 * possibility of a race. If there is, we take a lock.
1886 */ 1918 */
1887 1919
1920void __mem_cgroup_begin_update_page_stat(struct page *page,
1921 bool *locked, unsigned long *flags)
1922{
1923 struct mem_cgroup *memcg;
1924 struct page_cgroup *pc;
1925
1926 pc = lookup_page_cgroup(page);
1927again:
1928 memcg = pc->mem_cgroup;
1929 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1930 return;
1931 /*
1932 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */
1937 if (!mem_cgroup_stolen(memcg))
1938 return;
1939
1940 move_lock_mem_cgroup(memcg, flags);
1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1942 move_unlock_mem_cgroup(memcg, flags);
1943 goto again;
1944 }
1945 *locked = true;
1946}
1947
1948void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1949{
1950 struct page_cgroup *pc = lookup_page_cgroup(page);
1951
1952 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while the
1954 * lock is held, because any routine that modifies pc->mem_cgroup
1955 * must take move_lock_mem_cgroup().
1956 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958}
1959
1888void mem_cgroup_update_page_stat(struct page *page, 1960void mem_cgroup_update_page_stat(struct page *page,
1889 enum mem_cgroup_page_stat_item idx, int val) 1961 enum mem_cgroup_page_stat_item idx, int val)
1890{ 1962{
1891 struct mem_cgroup *memcg; 1963 struct mem_cgroup *memcg;
1892 struct page_cgroup *pc = lookup_page_cgroup(page); 1964 struct page_cgroup *pc = lookup_page_cgroup(page);
1893 bool need_unlock = false;
1894 unsigned long uninitialized_var(flags); 1965 unsigned long uninitialized_var(flags);
1895 1966
1896 if (mem_cgroup_disabled()) 1967 if (mem_cgroup_disabled())
1897 return; 1968 return;
1898 1969
1899 rcu_read_lock();
1900 memcg = pc->mem_cgroup; 1970 memcg = pc->mem_cgroup;
1901 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1971 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1902 goto out; 1972 return;
1903 /* pc->mem_cgroup is unstable ? */
1904 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1905 /* take a lock against to access pc->mem_cgroup */
1906 move_lock_page_cgroup(pc, &flags);
1907 need_unlock = true;
1908 memcg = pc->mem_cgroup;
1909 if (!memcg || !PageCgroupUsed(pc))
1910 goto out;
1911 }
1912 1973
1913 switch (idx) { 1974 switch (idx) {
1914 case MEMCG_NR_FILE_MAPPED: 1975 case MEMCG_NR_FILE_MAPPED:
1915 if (val > 0)
1916 SetPageCgroupFileMapped(pc);
1917 else if (!page_mapped(page))
1918 ClearPageCgroupFileMapped(pc);
1919 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1920 break; 1977 break;
1921 default: 1978 default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1923 } 1980 }
1924 1981
1925 this_cpu_add(memcg->stat->count[idx], val); 1982 this_cpu_add(memcg->stat->count[idx], val);
1926
1927out:
1928 if (unlikely(need_unlock))
1929 move_unlock_page_cgroup(pc, &flags);
1930 rcu_read_unlock();
1931 return;
1932} 1983}
1933EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1934 1984
1935/* 1985/*
1936 * size of first charge trial. "32" comes from vmscan.c's magic value. 1986 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2101 per_cpu(memcg->stat->events[i], cpu) = 0; 2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2102 memcg->nocpu_base.events[i] += x; 2152 memcg->nocpu_base.events[i] += x;
2103 } 2153 }
2104 /* need to clear ON_MOVE value, works as a kind of lock. */
2105 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2106 spin_unlock(&memcg->pcp_counter_lock);
2107}
2108
2109static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2110{
2111 int idx = MEM_CGROUP_ON_MOVE;
2112
2113 spin_lock(&memcg->pcp_counter_lock);
2114 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2115 spin_unlock(&memcg->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2116} 2155}
2117 2156
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2123 struct memcg_stock_pcp *stock; 2162 struct memcg_stock_pcp *stock;
2124 struct mem_cgroup *iter; 2163 struct mem_cgroup *iter;
2125 2164
2126 if ((action == CPU_ONLINE)) { 2165 if (action == CPU_ONLINE)
2127 for_each_mem_cgroup(iter)
2128 synchronize_mem_cgroup_on_move(iter, cpu);
2129 return NOTIFY_OK; 2166 return NOTIFY_OK;
2130 }
2131 2167
2132 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2133 return NOTIFY_OK; 2169 return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2212 if (!oom_check) 2248 if (!oom_check)
2213 return CHARGE_NOMEM; 2249 return CHARGE_NOMEM;
2214 /* check OOM */ 2250 /* check OOM */
2215 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2216 return CHARGE_OOM_DIE; 2252 return CHARGE_OOM_DIE;
2217 2253
2218 return CHARGE_RETRY; 2254 return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2446{ 2482{
2447 struct zone *uninitialized_var(zone); 2483 struct zone *uninitialized_var(zone);
2448 bool was_on_lru = false; 2484 bool was_on_lru = false;
2485 bool anon;
2449 2486
2450 lock_page_cgroup(pc); 2487 lock_page_cgroup(pc);
2451 if (unlikely(PageCgroupUsed(pc))) { 2488 if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2481 * See mem_cgroup_add_lru_list(), etc. 2518 * See mem_cgroup_add_lru_list(), etc.
2482 */ 2519 */
2483 smp_wmb(); 2520 smp_wmb();
2484 switch (ctype) { 2521 SetPageCgroupUsed(pc);
2485 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2486 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2487 SetPageCgroupCache(pc);
2488 SetPageCgroupUsed(pc);
2489 break;
2490 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2491 ClearPageCgroupCache(pc);
2492 SetPageCgroupUsed(pc);
2493 break;
2494 default:
2495 break;
2496 }
2497 2522
2498 if (lrucare) { 2523 if (lrucare) {
2499 if (was_on_lru) { 2524 if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2504 spin_unlock_irq(&zone->lru_lock); 2529 spin_unlock_irq(&zone->lru_lock);
2505 } 2530 }
2506 2531
2507 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2533 anon = true;
2534 else
2535 anon = false;
2536
2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2508 unlock_page_cgroup(pc); 2538 unlock_page_cgroup(pc);
2509 2539
2510 /* 2540 /*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2517 2547
2518#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2519 2549
2520#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
2521 (1 << PCG_MIGRATION))
2522/* 2551/*
2523 * Because tail pages are not marked as "used", set it. We're under 2552 * Because tail pages are not marked as "used", set it. We're under
2524 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
2569{ 2598{
2570 unsigned long flags; 2599 unsigned long flags;
2571 int ret; 2600 int ret;
2601 bool anon = PageAnon(page);
2572 2602
2573 VM_BUG_ON(from == to); 2603 VM_BUG_ON(from == to);
2574 VM_BUG_ON(PageLRU(page)); 2604 VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
2588 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2589 goto unlock; 2619 goto unlock;
2590 2620
2591 move_lock_page_cgroup(pc, &flags); 2621 move_lock_mem_cgroup(from, &flags);
2592 2622
2593 if (PageCgroupFileMapped(pc)) { 2623 if (!anon && page_mapped(page)) {
2594 /* Update mapped_file data for mem_cgroup */ 2624 /* Update mapped_file data for mem_cgroup */
2595 preempt_disable(); 2625 preempt_disable();
2596 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2597 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2598 preempt_enable(); 2628 preempt_enable();
2599 } 2629 }
2600 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2601 if (uncharge) 2631 if (uncharge)
2602 /* This is not "cancel", but cancel_charge does all we need. */ 2632 /* This is not "cancel", but cancel_charge does all we need. */
2603 __mem_cgroup_cancel_charge(from, nr_pages); 2633 __mem_cgroup_cancel_charge(from, nr_pages);
2604 2634
2605 /* caller should have done css_get */ 2635 /* caller should have done css_get */
2606 pc->mem_cgroup = to; 2636 pc->mem_cgroup = to;
2607 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2637 mem_cgroup_charge_statistics(to, anon, nr_pages);
2608 /* 2638 /*
2609 * We charge against "to" which may not have any tasks. Then, "to" 2639 * We charge against "to" which may not have any tasks. Then, "to"
2610 * can be under rmdir(). But in current implementation, caller of 2640 * can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
2612 * guaranteed that "to" is never removed. So, we don't check rmdir 2642 * guaranteed that "to" is never removed. So, we don't check rmdir
2613 * status here. 2643 * status here.
2614 */ 2644 */
2615 move_unlock_page_cgroup(pc, &flags); 2645 move_unlock_mem_cgroup(from, &flags);
2616 ret = 0; 2646 ret = 0;
2617unlock: 2647unlock:
2618 unlock_page_cgroup(pc); 2648 unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
2914 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2915 if (unlikely(batch->memcg != memcg)) 2945 if (unlikely(batch->memcg != memcg))
2916 memcg_oom_recover(memcg); 2946 memcg_oom_recover(memcg);
2917 return;
2918} 2947}
2919 2948
2920/* 2949/*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2926 struct mem_cgroup *memcg = NULL; 2955 struct mem_cgroup *memcg = NULL;
2927 unsigned int nr_pages = 1; 2956 unsigned int nr_pages = 1;
2928 struct page_cgroup *pc; 2957 struct page_cgroup *pc;
2958 bool anon;
2929 2959
2930 if (mem_cgroup_disabled()) 2960 if (mem_cgroup_disabled())
2931 return NULL; 2961 return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2951 if (!PageCgroupUsed(pc)) 2981 if (!PageCgroupUsed(pc))
2952 goto unlock_out; 2982 goto unlock_out;
2953 2983
2984 anon = PageAnon(page);
2985
2954 switch (ctype) { 2986 switch (ctype) {
2955 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before page reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 2996 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 2997 if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2969 break; 3008 break;
2970 } 3009 }
2971 3010
2972 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); 3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2973 3012
2974 ClearPageCgroupUsed(pc); 3013 ClearPageCgroupUsed(pc);
2975 /* 3014 /*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276{ 3315{
3277 struct page *used, *unused; 3316 struct page *used, *unused;
3278 struct page_cgroup *pc; 3317 struct page_cgroup *pc;
3318 bool anon;
3279 3319
3280 if (!memcg) 3320 if (!memcg)
3281 return; 3321 return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3297 lock_page_cgroup(pc); 3337 lock_page_cgroup(pc);
3298 ClearPageCgroupMigration(pc); 3338 ClearPageCgroupMigration(pc);
3299 unlock_page_cgroup(pc); 3339 unlock_page_cgroup(pc);
3300 3340 anon = PageAnon(used);
3301 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3302 3344
3303 /* 3345 /*
3304 * If a page is a file cache, radix-tree replacement is very atomic 3346 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3308 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3309 * check. (see prepare_charge() also) 3351 * check. (see prepare_charge() also)
3310 */ 3352 */
3311 if (PageAnon(used)) 3353 if (anon)
3312 mem_cgroup_uncharge_page(used); 3354 mem_cgroup_uncharge_page(used);
3313 /* 3355 /*
3314 * At migration, we may charge account against cgroup which has no 3356 * At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3338 /* fix accounting on old pages */ 3380 /* fix accounting on old pages */
3339 lock_page_cgroup(pc); 3381 lock_page_cgroup(pc);
3340 memcg = pc->mem_cgroup; 3382 memcg = pc->mem_cgroup;
3341 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); 3383 mem_cgroup_charge_statistics(memcg, false, -1);
3342 ClearPageCgroupUsed(pc); 3384 ClearPageCgroupUsed(pc);
3343 unlock_page_cgroup(pc); 3385 unlock_page_cgroup(pc);
3344 3386
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3549 break; 3591 break;
3550 3592
3551 nr_scanned = 0; 3593 nr_scanned = 0;
3552 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3553 gfp_mask, &nr_scanned); 3595 gfp_mask, &nr_scanned);
3554 nr_reclaimed += reclaimed; 3596 nr_reclaimed += reclaimed;
3555 *total_scanned += nr_scanned; 3597 *total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3576 next_mz = 3618 next_mz =
3577 __mem_cgroup_largest_soft_limit_node(mctz); 3619 __mem_cgroup_largest_soft_limit_node(mctz);
3578 if (next_mz == mz) 3620 if (next_mz == mz)
3579 css_put(&next_mz->mem->css); 3621 css_put(&next_mz->memcg->css);
3580 else /* next_mz == NULL or other memcg */ 3622 else /* next_mz == NULL or other memcg */
3581 break; 3623 break;
3582 } while (1); 3624 } while (1);
3583 } 3625 }
3584 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3585 excess = res_counter_soft_limit_excess(&mz->mem->res); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3586 /* 3628 /*
3587 * One school of thought says that we should not add 3629 * One school of thought says that we should not add
3588 * back the node to the tree if reclaim returns 0. 3630 * back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3592 * term TODO. 3634 * term TODO.
3593 */ 3635 */
3594 /* If excess == 0, no tree ops */ 3636 /* If excess == 0, no tree ops */
3595 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3596 spin_unlock(&mctz->lock); 3638 spin_unlock(&mctz->lock);
3597 css_put(&mz->mem->css); 3639 css_put(&mz->memcg->css);
3598 loop++; 3640 loop++;
3599 /* 3641 /*
3600 * Could not reclaim anything and there are no more 3642 * Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3607 break; 3649 break;
3608 } while (!nr_reclaimed); 3650 } while (!nr_reclaimed);
3609 if (next_mz) 3651 if (next_mz)
3610 css_put(&next_mz->mem->css); 3652 css_put(&next_mz->memcg->css);
3611 return nr_reclaimed; 3653 return nr_reclaimed;
3612} 3654}
3613 3655
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3629 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3630 list = &mz->lruvec.lists[lru]; 3672 list = &mz->lruvec.lists[lru];
3631 3673
3632 loop = MEM_CGROUP_ZSTAT(mz, lru); 3674 loop = mz->lru_size[lru];
3633 /* give some margin against EBUSY etc...*/ 3675 /* give some margin against EBUSY etc...*/
3634 loop += 256; 3676 loop += 256;
3635 busy = NULL; 3677 busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
3703 mem_cgroup_start_move(memcg); 3745 mem_cgroup_start_move(memcg);
3704 for_each_node_state(node, N_HIGH_MEMORY) { 3746 for_each_node_state(node, N_HIGH_MEMORY) {
3705 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3706 enum lru_list l; 3748 enum lru_list lru;
3707 for_each_lru(l) { 3749 for_each_lru(lru) {
3708 ret = mem_cgroup_force_empty_list(memcg, 3750 ret = mem_cgroup_force_empty_list(memcg,
3709 node, zid, l); 3751 node, zid, lru);
3710 if (ret) 3752 if (ret)
3711 break; 3753 break;
3712 } 3754 }
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3860 break; 3902 break;
3861 default: 3903 default:
3862 BUG(); 3904 BUG();
3863 break;
3864 } 3905 }
3865 return val; 3906 return val;
3866} 3907}
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3939out: 3980out:
3940 *mem_limit = min_limit; 3981 *mem_limit = min_limit;
3941 *memsw_limit = min_memsw_limit; 3982 *memsw_limit = min_memsw_limit;
3942 return;
3943} 3983}
3944 3984
3945static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4098 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4099 unsigned long node_nr; 4139 unsigned long node_nr;
4100 struct cgroup *cont = m->private; 4140 struct cgroup *cont = m->private;
4101 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4102 4142
4103 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4104 seq_printf(m, "total=%lu", total_nr); 4144 seq_printf(m, "total=%lu", total_nr);
4105 for_each_node_state(nid, N_HIGH_MEMORY) { 4145 for_each_node_state(nid, N_HIGH_MEMORY) {
4106 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4107 seq_printf(m, " N%d=%lu", nid, node_nr); 4147 seq_printf(m, " N%d=%lu", nid, node_nr);
4108 } 4148 }
4109 seq_putc(m, '\n'); 4149 seq_putc(m, '\n');
4110 4150
4111 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4112 seq_printf(m, "file=%lu", file_nr); 4152 seq_printf(m, "file=%lu", file_nr);
4113 for_each_node_state(nid, N_HIGH_MEMORY) { 4153 for_each_node_state(nid, N_HIGH_MEMORY) {
4114 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4115 LRU_ALL_FILE); 4155 LRU_ALL_FILE);
4116 seq_printf(m, " N%d=%lu", nid, node_nr); 4156 seq_printf(m, " N%d=%lu", nid, node_nr);
4117 } 4157 }
4118 seq_putc(m, '\n'); 4158 seq_putc(m, '\n');
4119 4159
4120 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4121 seq_printf(m, "anon=%lu", anon_nr); 4161 seq_printf(m, "anon=%lu", anon_nr);
4122 for_each_node_state(nid, N_HIGH_MEMORY) { 4162 for_each_node_state(nid, N_HIGH_MEMORY) {
4123 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4124 LRU_ALL_ANON); 4164 LRU_ALL_ANON);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 4165 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 4166 }
4127 seq_putc(m, '\n'); 4167 seq_putc(m, '\n');
4128 4168
4129 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4130 seq_printf(m, "unevictable=%lu", unevictable_nr); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 4171 for_each_node_state(nid, N_HIGH_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 BIT(LRU_UNEVICTABLE)); 4173 BIT(LRU_UNEVICTABLE));
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 4174 seq_printf(m, " N%d=%lu", nid, node_nr);
4135 } 4175 }
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4141static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4142 struct cgroup_map_cb *cb) 4182 struct cgroup_map_cb *cb)
4143{ 4183{
4144 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4145 struct mcs_total_stat mystat; 4185 struct mcs_total_stat mystat;
4146 int i; 4186 int i;
4147 4187
4148 memset(&mystat, 0, sizeof(mystat)); 4188 memset(&mystat, 0, sizeof(mystat));
4149 mem_cgroup_get_local_stat(mem_cont, &mystat); 4189 mem_cgroup_get_local_stat(memcg, &mystat);
4150 4190
4151 4191
4152 for (i = 0; i < NR_MCS_STAT; i++) { 4192 for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4158 /* Hierarchical information */ 4198 /* Hierarchical information */
4159 { 4199 {
4160 unsigned long long limit, memsw_limit; 4200 unsigned long long limit, memsw_limit;
4161 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4162 cb->fill(cb, "hierarchical_memory_limit", limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit);
4163 if (do_swap_account) 4203 if (do_swap_account)
4164 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4165 } 4205 }
4166 4206
4167 memset(&mystat, 0, sizeof(mystat)); 4207 memset(&mystat, 0, sizeof(mystat));
4168 mem_cgroup_get_total_stat(mem_cont, &mystat); 4208 mem_cgroup_get_total_stat(memcg, &mystat);
4169 for (i = 0; i < NR_MCS_STAT; i++) { 4209 for (i = 0; i < NR_MCS_STAT; i++) {
4170 if (i == MCS_SWAP && !do_swap_account) 4210 if (i == MCS_SWAP && !do_swap_account)
4171 continue; 4211 continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4181 4221
4182 for_each_online_node(nid) 4222 for_each_online_node(nid)
4183 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4184 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4185 4225
4186 recent_rotated[0] += 4226 recent_rotated[0] +=
4187 mz->reclaim_stat.recent_rotated[0]; 4227 mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4426 else 4466 else
4427 BUG(); 4467 BUG();
4428 4468
4429 /*
4430 * Something went wrong if we trying to unregister a threshold
4431 * if we don't have thresholds
4432 */
4433 BUG_ON(!thresholds);
4434
4435 if (!thresholds->primary) 4469 if (!thresholds->primary)
4436 goto unlock; 4470 goto unlock;
4437 4471
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4736{ 4770{
4737 struct mem_cgroup_per_node *pn; 4771 struct mem_cgroup_per_node *pn;
4738 struct mem_cgroup_per_zone *mz; 4772 struct mem_cgroup_per_zone *mz;
4739 enum lru_list l; 4773 enum lru_list lru;
4740 int zone, tmp = node; 4774 int zone, tmp = node;
4741 /* 4775 /*
4742 * This routine is called against possible nodes. 4776 * This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4754 4788
4755 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4756 mz = &pn->zoneinfo[zone]; 4790 mz = &pn->zoneinfo[zone];
4757 for_each_lru(l) 4791 for_each_lru(lru)
4758 INIT_LIST_HEAD(&mz->lruvec.lists[l]); 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4759 mz->usage_in_excess = 0; 4793 mz->usage_in_excess = 0;
4760 mz->on_tree = false; 4794 mz->on_tree = false;
4761 mz->mem = memcg; 4795 mz->memcg = memcg;
4762 } 4796 }
4763 memcg->info.nodeinfo[node] = pn; 4797 memcg->info.nodeinfo[node] = pn;
4764 return 0; 4798 return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4771 4805
4772static struct mem_cgroup *mem_cgroup_alloc(void) 4806static struct mem_cgroup *mem_cgroup_alloc(void)
4773{ 4807{
4774 struct mem_cgroup *mem; 4808 struct mem_cgroup *memcg;
4775 int size = sizeof(struct mem_cgroup); 4809 int size = sizeof(struct mem_cgroup);
4776 4810
4777 /* Can be very big if MAX_NUMNODES is very big */ 4811 /* Can be very big if MAX_NUMNODES is very big */
4778 if (size < PAGE_SIZE) 4812 if (size < PAGE_SIZE)
4779 mem = kzalloc(size, GFP_KERNEL); 4813 memcg = kzalloc(size, GFP_KERNEL);
4780 else 4814 else
4781 mem = vzalloc(size); 4815 memcg = vzalloc(size);
4782 4816
4783 if (!mem) 4817 if (!memcg)
4784 return NULL; 4818 return NULL;
4785 4819
4786 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4787 if (!mem->stat) 4821 if (!memcg->stat)
4788 goto out_free; 4822 goto out_free;
4789 spin_lock_init(&mem->pcp_counter_lock); 4823 spin_lock_init(&memcg->pcp_counter_lock);
4790 return mem; 4824 return memcg;
4791 4825
4792out_free: 4826out_free:
4793 if (size < PAGE_SIZE) 4827 if (size < PAGE_SIZE)
4794 kfree(mem); 4828 kfree(memcg);
4795 else 4829 else
4796 vfree(mem); 4830 vfree(memcg);
4797 return NULL; 4831 return NULL;
4798} 4832}
4799 4833
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
4981 atomic_set(&memcg->refcnt, 1); 5015 atomic_set(&memcg->refcnt, 1);
4982 memcg->move_charge_at_immigrate = 0; 5016 memcg->move_charge_at_immigrate = 0;
4983 mutex_init(&memcg->thresholds_lock); 5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
4984 return &memcg->css; 5019 return &memcg->css;
4985free_out: 5020free_out:
4986 __mem_cgroup_free(memcg); 5021 __mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
5075} 5110}
5076 5111
5077/** 5112/**
5078 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5113 * get_mctgt_type - get target type of moving charge
5079 * @vma: the vma the pte to be checked belongs 5114 * @vma: the vma the pte to be checked belongs
5080 * @addr: the address corresponding to the pte to be checked 5115 * @addr: the address corresponding to the pte to be checked
5081 * @ptent: the pte to be checked 5116 * @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
5098}; 5133};
5099 5134
5100enum mc_target_type { 5135enum mc_target_type {
5101 MC_TARGET_NONE, /* not used */ 5136 MC_TARGET_NONE = 0,
5102 MC_TARGET_PAGE, 5137 MC_TARGET_PAGE,
5103 MC_TARGET_SWAP, 5138 MC_TARGET_SWAP,
5104}; 5139};
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5179 return page; 5214 return page;
5180} 5215}
5181 5216
5182static int is_target_pte_for_mc(struct vm_area_struct *vma, 5217static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, union mc_target *target) 5218 unsigned long addr, pte_t ptent, union mc_target *target)
5184{ 5219{
5185 struct page *page = NULL; 5220 struct page *page = NULL;
5186 struct page_cgroup *pc; 5221 struct page_cgroup *pc;
5187 int ret = 0; 5222 enum mc_target_type ret = MC_TARGET_NONE;
5188 swp_entry_t ent = { .val = 0 }; 5223 swp_entry_t ent = { .val = 0 };
5189 5224
5190 if (pte_present(ptent)) 5225 if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5195 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5196 5231
5197 if (!page && !ent.val) 5232 if (!page && !ent.val)
5198 return 0; 5233 return ret;
5199 if (page) { 5234 if (page) {
5200 pc = lookup_page_cgroup(page); 5235 pc = lookup_page_cgroup(page);
5201 /* 5236 /*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5221 return ret; 5256 return ret;
5222} 5257}
5223 5258
5259#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5260/*
5261 * We don't consider swapping or file mapped pages because THP does not
5262 * support them for now.
5263 * Caller should make sure that pmd_trans_huge(pmd) is true.
5264 */
5265static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5266 unsigned long addr, pmd_t pmd, union mc_target *target)
5267{
5268 struct page *page = NULL;
5269 struct page_cgroup *pc;
5270 enum mc_target_type ret = MC_TARGET_NONE;
5271
5272 page = pmd_page(pmd);
5273 VM_BUG_ON(!page || !PageHead(page));
5274 if (!move_anon())
5275 return ret;
5276 pc = lookup_page_cgroup(page);
5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5278 ret = MC_TARGET_PAGE;
5279 if (target) {
5280 get_page(page);
5281 target->page = page;
5282 }
5283 }
5284 return ret;
5285}
5286#else
5287static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5288 unsigned long addr, pmd_t pmd, union mc_target *target)
5289{
5290 return MC_TARGET_NONE;
5291}
5292#endif
5293
5224static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5294static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5225 unsigned long addr, unsigned long end, 5295 unsigned long addr, unsigned long end,
5226 struct mm_walk *walk) 5296 struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5229 pte_t *pte; 5299 pte_t *pte;
5230 spinlock_t *ptl; 5300 spinlock_t *ptl;
5231 5301
5232 split_huge_page_pmd(walk->mm, pmd); 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5304 mc.precharge += HPAGE_PMD_NR;
5305 spin_unlock(&vma->vm_mm->page_table_lock);
5306 return 0;
5307 }
5233 5308
5234 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5309 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5235 for (; addr != end; pte++, addr += PAGE_SIZE) 5310 for (; addr != end; pte++, addr += PAGE_SIZE)
5236 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5311 if (get_mctgt_type(vma, addr, *pte, NULL))
5237 mc.precharge++; /* increment precharge temporarily */ 5312 mc.precharge++; /* increment precharge temporarily */
5238 pte_unmap_unlock(pte - 1, ptl); 5313 pte_unmap_unlock(pte - 1, ptl);
5239 cond_resched(); 5314 cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5388 struct vm_area_struct *vma = walk->private; 5463 struct vm_area_struct *vma = walk->private;
5389 pte_t *pte; 5464 pte_t *pte;
5390 spinlock_t *ptl; 5465 spinlock_t *ptl;
5466 enum mc_target_type target_type;
5467 union mc_target target;
5468 struct page *page;
5469 struct page_cgroup *pc;
5470
5471 /*
5472 * We don't take compound_lock() here but no race with splitting thp
5473 * happens because:
5474 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5475 * under splitting, which means there's no concurrent thp split,
5476 * - if another thread runs into split_huge_page() just after we
5477 * entered this if-block, the thread must wait for page table lock
5478 * to be unlocked in __split_huge_page_splitting(), where the main
5479 * part of thp split is not executed yet.
5480 */
5481 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5482 if (!mc.precharge) {
5483 spin_unlock(&vma->vm_mm->page_table_lock);
5484 return 0;
5485 }
5486 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5487 if (target_type == MC_TARGET_PAGE) {
5488 page = target.page;
5489 if (!isolate_lru_page(page)) {
5490 pc = lookup_page_cgroup(page);
5491 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5492 pc, mc.from, mc.to,
5493 false)) {
5494 mc.precharge -= HPAGE_PMD_NR;
5495 mc.moved_charge += HPAGE_PMD_NR;
5496 }
5497 putback_lru_page(page);
5498 }
5499 put_page(page);
5500 }
5501 spin_unlock(&vma->vm_mm->page_table_lock);
5502 return 0;
5503 }
5391 5504
5392 split_huge_page_pmd(walk->mm, pmd);
5393retry: 5505retry:
5394 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5395 for (; addr != end; addr += PAGE_SIZE) { 5507 for (; addr != end; addr += PAGE_SIZE) {
5396 pte_t ptent = *(pte++); 5508 pte_t ptent = *(pte++);
5397 union mc_target target;
5398 int type;
5399 struct page *page;
5400 struct page_cgroup *pc;
5401 swp_entry_t ent; 5509 swp_entry_t ent;
5402 5510
5403 if (!mc.precharge) 5511 if (!mc.precharge)
5404 break; 5512 break;
5405 5513
5406 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5514 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5407 switch (type) {
5408 case MC_TARGET_PAGE: 5515 case MC_TARGET_PAGE:
5409 page = target.page; 5516 page = target.page;
5410 if (isolate_lru_page(page)) 5517 if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
5417 mc.moved_charge++; 5524 mc.moved_charge++;
5418 } 5525 }
5419 putback_lru_page(page); 5526 putback_lru_page(page);
5420put: /* is_target_pte_for_mc() gets the page */ 5527put: /* get_mctgt_type() gets the page */
5421 put_page(page); 5528 put_page(page);
5422 break; 5529 break;
5423 case MC_TARGET_SWAP: 5530 case MC_TARGET_SWAP:
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea36140..c22076ffdd44 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1063 * The check (unnecessarily) ignores LRU pages being isolated and 1063 * The check (unnecessarily) ignores LRU pages being isolated and
1064 * walked by the page reclaim code, however that's not a big loss. 1064 * walked by the page reclaim code, however that's not a big loss.
1065 */ 1065 */
1066 if (!PageHuge(p) && !PageTransCompound(p)) { 1066 if (!PageHuge(p) && !PageTransTail(p)) {
1067 if (!PageLRU(p)) 1067 if (!PageLRU(p))
1068 shake_page(p, 0); 1068 shake_page(p, 0);
1069 if (!PageLRU(p)) { 1069 if (!PageLRU(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index 8438c157e4d9..3416b6e018d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
125 125
126#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
127 127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128void sync_mm_rss(struct mm_struct *mm)
129{ 129{
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 current->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 current->rss_stat.events = 0;
139} 139}
140 140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 sync_mm_rss(task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184} 161}
185#else /* SPLIT_RSS_COUNTING */ 162#else /* SPLIT_RSS_COUNTING */
186 163
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
661 int i; 638 int i;
662 639
663 if (current->mm == mm) 640 if (current->mm == mm)
664 sync_mm_rss(current, mm); 641 sync_mm_rss(mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++) 642 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i]) 643 if (rss[i])
667 add_mm_counter(mm, i, rss[i]); 644 add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1247 do { 1224 do {
1248 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1250 if (next-addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd); 1229 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 continue; 1231 goto next;
1255 /* fall through */ 1232 /* fall through */
1256 } 1233 }
1257 if (pmd_none_or_clear_bad(pmd)) 1234 /*
1258 continue; 1235 * Here there can be other concurrent MADV_DONTNEED or
1236 * trans huge page faults running, and if the pmd is
1237 * none or trans huge it can change under us. This is
1238 * because MADV_DONTNEED holds the mmap_sem in read
1239 * mode.
1240 */
1241 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1242 goto next;
1259 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1243 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1244next:
1260 cond_resched(); 1245 cond_resched();
1261 } while (pmd++, addr = next, addr != end); 1246 } while (pmd++, addr = next, addr != end);
1262 1247
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..cfb6c8678754 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma->vm_mm, pmd);
515 if (pmd_none_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
518 flags, private)) 518 flags, private))
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1323 err = -ESRCH; 1323 err = -ESRCH;
1324 goto out; 1324 goto out;
1325 } 1325 }
1326 mm = get_task_mm(task); 1326 get_task_struct(task);
1327 rcu_read_unlock();
1328 1327
1329 err = -EINVAL; 1328 err = -EINVAL;
1330 if (!mm)
1331 goto out;
1332 1329
1333 /* 1330 /*
1334 * Check if this process has the right to modify the specified 1331 * Check if this process has the right to modify the specified
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 * capabilities, superuser privileges or the same 1333 * capabilities, superuser privileges or the same
1337 * userid as the target process. 1334 * userid as the target process.
1338 */ 1335 */
1339 rcu_read_lock();
1340 tcred = __task_cred(task); 1336 tcred = __task_cred(task);
1341 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1342 cred->uid != tcred->suid && cred->uid != tcred->uid && 1338 cred->uid != tcred->suid && cred->uid != tcred->uid &&
1343 !capable(CAP_SYS_NICE)) { 1339 !capable(CAP_SYS_NICE)) {
1344 rcu_read_unlock(); 1340 rcu_read_unlock();
1345 err = -EPERM; 1341 err = -EPERM;
1346 goto out; 1342 goto out_put;
1347 } 1343 }
1348 rcu_read_unlock(); 1344 rcu_read_unlock();
1349 1345
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1351 /* Is the user allowed to access the target nodes? */ 1347 /* Is the user allowed to access the target nodes? */
1352 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1348 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1353 err = -EPERM; 1349 err = -EPERM;
1354 goto out; 1350 goto out_put;
1355 } 1351 }
1356 1352
1357 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1353 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1358 err = -EINVAL; 1354 err = -EINVAL;
1359 goto out; 1355 goto out_put;
1360 } 1356 }
1361 1357
1362 err = security_task_movememory(task); 1358 err = security_task_movememory(task);
1363 if (err) 1359 if (err)
1364 goto out; 1360 goto out_put;
1365 1361
1366 err = do_migrate_pages(mm, old, new, 1362 mm = get_task_mm(task);
1367 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1363 put_task_struct(task);
1368out:
1369 if (mm) 1364 if (mm)
1370 mmput(mm); 1365 err = do_migrate_pages(mm, old, new,
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL;
1369
1370 mmput(mm);
1371out:
1371 NODEMASK_SCRATCH_FREE(scratch); 1372 NODEMASK_SCRATCH_FREE(scratch);
1372 1373
1373 return err; 1374 return err;
1375
1376out_put:
1377 put_task_struct(task);
1378 goto out;
1379
1374} 1380}
1375 1381
1376 1382
@@ -1844,18 +1850,24 @@ struct page *
1844alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1850alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1845 unsigned long addr, int node) 1851 unsigned long addr, int node)
1846{ 1852{
1847 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1853 struct mempolicy *pol;
1848 struct zonelist *zl; 1854 struct zonelist *zl;
1849 struct page *page; 1855 struct page *page;
1856 unsigned int cpuset_mems_cookie;
1857
1858retry_cpuset:
1859 pol = get_vma_policy(current, vma, addr);
1860 cpuset_mems_cookie = get_mems_allowed();
1850 1861
1851 get_mems_allowed();
1852 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1862 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1853 unsigned nid; 1863 unsigned nid;
1854 1864
1855 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1865 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1856 mpol_cond_put(pol); 1866 mpol_cond_put(pol);
1857 page = alloc_page_interleave(gfp, order, nid); 1867 page = alloc_page_interleave(gfp, order, nid);
1858 put_mems_allowed(); 1868 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1869 goto retry_cpuset;
1870
1859 return page; 1871 return page;
1860 } 1872 }
1861 zl = policy_zonelist(gfp, pol, node); 1873 zl = policy_zonelist(gfp, pol, node);
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1866 struct page *page = __alloc_pages_nodemask(gfp, order, 1878 struct page *page = __alloc_pages_nodemask(gfp, order,
1867 zl, policy_nodemask(gfp, pol)); 1879 zl, policy_nodemask(gfp, pol));
1868 __mpol_put(pol); 1880 __mpol_put(pol);
1869 put_mems_allowed(); 1881 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1882 goto retry_cpuset;
1870 return page; 1883 return page;
1871 } 1884 }
1872 /* 1885 /*
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1874 */ 1887 */
1875 page = __alloc_pages_nodemask(gfp, order, zl, 1888 page = __alloc_pages_nodemask(gfp, order, zl,
1876 policy_nodemask(gfp, pol)); 1889 policy_nodemask(gfp, pol));
1877 put_mems_allowed(); 1890 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1891 goto retry_cpuset;
1878 return page; 1892 return page;
1879} 1893}
1880 1894
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1901{ 1915{
1902 struct mempolicy *pol = current->mempolicy; 1916 struct mempolicy *pol = current->mempolicy;
1903 struct page *page; 1917 struct page *page;
1918 unsigned int cpuset_mems_cookie;
1904 1919
1905 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1920 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1906 pol = &default_policy; 1921 pol = &default_policy;
1907 1922
1908 get_mems_allowed(); 1923retry_cpuset:
1924 cpuset_mems_cookie = get_mems_allowed();
1925
1909 /* 1926 /*
1910 * No reference counting needed for current->mempolicy 1927 * No reference counting needed for current->mempolicy
1911 * nor system default_policy 1928 * nor system default_policy
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1916 page = __alloc_pages_nodemask(gfp, order, 1933 page = __alloc_pages_nodemask(gfp, order,
1917 policy_zonelist(gfp, pol, numa_node_id()), 1934 policy_zonelist(gfp, pol, numa_node_id()),
1918 policy_nodemask(gfp, pol)); 1935 policy_nodemask(gfp, pol));
1919 put_mems_allowed(); 1936
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939
1920 return page; 1940 return page;
1921} 1941}
1922EXPORT_SYMBOL(alloc_pages_current); 1942EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/migrate.c b/mm/migrate.c
index 1503b6b54ecb..51c08a0c6f68 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1174,20 +1174,17 @@ set_status:
1174 * Migrate an array of page address onto an array of nodes and fill 1174 * Migrate an array of page address onto an array of nodes and fill
1175 * the corresponding array of status. 1175 * the corresponding array of status.
1176 */ 1176 */
1177static int do_pages_move(struct mm_struct *mm, struct task_struct *task, 1177static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1178 unsigned long nr_pages, 1178 unsigned long nr_pages,
1179 const void __user * __user *pages, 1179 const void __user * __user *pages,
1180 const int __user *nodes, 1180 const int __user *nodes,
1181 int __user *status, int flags) 1181 int __user *status, int flags)
1182{ 1182{
1183 struct page_to_node *pm; 1183 struct page_to_node *pm;
1184 nodemask_t task_nodes;
1185 unsigned long chunk_nr_pages; 1184 unsigned long chunk_nr_pages;
1186 unsigned long chunk_start; 1185 unsigned long chunk_start;
1187 int err; 1186 int err;
1188 1187
1189 task_nodes = cpuset_mems_allowed(task);
1190
1191 err = -ENOMEM; 1188 err = -ENOMEM;
1192 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 1189 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1193 if (!pm) 1190 if (!pm)
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1349 struct task_struct *task; 1346 struct task_struct *task;
1350 struct mm_struct *mm; 1347 struct mm_struct *mm;
1351 int err; 1348 int err;
1349 nodemask_t task_nodes;
1352 1350
1353 /* Check flags */ 1351 /* Check flags */
1354 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1352 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1364 rcu_read_unlock(); 1362 rcu_read_unlock();
1365 return -ESRCH; 1363 return -ESRCH;
1366 } 1364 }
1367 mm = get_task_mm(task); 1365 get_task_struct(task);
1368 rcu_read_unlock();
1369
1370 if (!mm)
1371 return -EINVAL;
1372 1366
1373 /* 1367 /*
1374 * Check if this process has the right to modify the specified 1368 * Check if this process has the right to modify the specified
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1376 * capabilities, superuser privileges or the same 1370 * capabilities, superuser privileges or the same
1377 * userid as the target process. 1371 * userid as the target process.
1378 */ 1372 */
1379 rcu_read_lock();
1380 tcred = __task_cred(task); 1373 tcred = __task_cred(task);
1381 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1382 cred->uid != tcred->suid && cred->uid != tcred->uid && 1375 cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1391 if (err) 1384 if (err)
1392 goto out; 1385 goto out;
1393 1386
1394 if (nodes) { 1387 task_nodes = cpuset_mems_allowed(task);
1395 err = do_pages_move(mm, task, nr_pages, pages, nodes, status, 1388 mm = get_task_mm(task);
1396 flags); 1389 put_task_struct(task);
1397 } else { 1390
1398 err = do_pages_stat(mm, nr_pages, pages, status); 1391 if (mm) {
1399 } 1392 if (nodes)
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1394 nodes, status, flags);
1395 else
1396 err = do_pages_stat(mm, nr_pages, pages, status);
1397 } else
1398 err = -EINVAL;
1400 1399
1401out:
1402 mmput(mm); 1400 mmput(mm);
1403 return err; 1401 return err;
1402
1403out:
1404 put_task_struct(task);
1405 return err;
1404} 1406}
1405 1407
1406/* 1408/*
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
164 } 164 }
165 /* fall through */ 165 /* fall through */
166 } 166 }
167 if (pmd_none_or_clear_bad(pmd)) 167 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
168 mincore_unmapped_range(vma, addr, next, vec); 168 mincore_unmapped_range(vma, addr, next, vec);
169 else 169 else
170 mincore_pte_range(vma, pmd, addr, next, vec); 170 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6f3766b57803..a7bf6a31c9f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
451} 451}
452 452
453/* 453/*
454 * Helper for vma_adjust in the split_vma insert case: 454 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
455 * insert vm structure into list and rbtree and anon_vma, 455 * mm's list and rbtree. It has already been inserted into the prio_tree.
456 * but it has already been inserted into prio_tree earlier.
457 */ 456 */
458static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 457static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
459{ 458{
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1112 * A dummy user value is used because we are not locking 1111 * A dummy user value is used because we are not locking
1113 * memory so no accounting is necessary 1112 * memory so no accounting is necessary
1114 */ 1113 */
1115 len = ALIGN(len, huge_page_size(&default_hstate)); 1114 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1116 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, 1115 VM_NORESERVE, &user,
1117 &user, HUGETLB_ANONHUGE_INODE); 1116 HUGETLB_ANONHUGE_INODE);
1118 if (IS_ERR(file)) 1117 if (IS_ERR(file))
1119 return PTR_ERR(file); 1118 return PTR_ERR(file);
1120 } 1119 }
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1439 /* 1438 /*
1440 * Is this a new hole at the lowest possible address? 1439 * Is this a new hole at the lowest possible address?
1441 */ 1440 */
1442 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { 1441 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1443 mm->free_area_cache = addr; 1442 mm->free_area_cache = addr;
1444 mm->cached_hole_size = ~0UL;
1445 }
1446} 1443}
1447 1444
1448/* 1445/*
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1457{ 1454{
1458 struct vm_area_struct *vma; 1455 struct vm_area_struct *vma;
1459 struct mm_struct *mm = current->mm; 1456 struct mm_struct *mm = current->mm;
1460 unsigned long addr = addr0; 1457 unsigned long addr = addr0, start_addr;
1461 1458
1462 /* requested length too big for entire address space */ 1459 /* requested length too big for entire address space */
1463 if (len > TASK_SIZE) 1460 if (len > TASK_SIZE)
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1481 mm->free_area_cache = mm->mmap_base; 1478 mm->free_area_cache = mm->mmap_base;
1482 } 1479 }
1483 1480
1481try_again:
1484 /* either no address requested or can't fit in requested address hole */ 1482 /* either no address requested or can't fit in requested address hole */
1485 addr = mm->free_area_cache; 1483 start_addr = addr = mm->free_area_cache;
1486
1487 /* make sure it can fit in the remaining address space */
1488 if (addr > len) {
1489 vma = find_vma(mm, addr-len);
1490 if (!vma || addr <= vma->vm_start)
1491 /* remember the address as a hint for next time */
1492 return (mm->free_area_cache = addr-len);
1493 }
1494
1495 if (mm->mmap_base < len)
1496 goto bottomup;
1497 1484
1498 addr = mm->mmap_base-len; 1485 if (addr < len)
1486 goto fail;
1499 1487
1488 addr -= len;
1500 do { 1489 do {
1501 /* 1490 /*
1502 * Lookup failure means no vma is above this address, 1491 * Lookup failure means no vma is above this address,
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1516 addr = vma->vm_start-len; 1505 addr = vma->vm_start-len;
1517 } while (len < vma->vm_start); 1506 } while (len < vma->vm_start);
1518 1507
1519bottomup: 1508fail:
1509 /*
1510 * if hint left us with no space for the requested
1511 * mapping then try again:
1512 *
1513 * Note: this is different with the case of bottomup
1514 * which does the fully line-search, but we use find_vma
1515 * here that causes some holes skipped.
1516 */
1517 if (start_addr != mm->mmap_base) {
1518 mm->free_area_cache = mm->mmap_base;
1519 mm->cached_hole_size = 0;
1520 goto try_again;
1521 }
1522
1520 /* 1523 /*
1521 * A failed mmap() very likely causes application failure, 1524 * A failed mmap() very likely causes application failure,
1522 * so fall back to the bottom-up function here. This scenario 1525 * so fall back to the bottom-up function here. This scenario
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080a..3dcfaf4ed355 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm); 56 sync_mm_rss(mm);
57 tsk->mm = NULL; 57 tsk->mm = NULL;
58 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
59 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 142ef4a1f480..a40992610ab6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
60 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
61 61
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) { 63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9a..4198e000f41a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h>
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h> 40#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
309 */ 310 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 311static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *memcg, 312 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 313 const nodemask_t *nodemask, bool force_kill)
313{ 314{
314 struct task_struct *g, *p; 315 struct task_struct *g, *p;
315 struct task_struct *chosen = NULL; 316 struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
335 if (test_tsk_thread_flag(p, TIF_MEMDIE)) { 336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
336 if (unlikely(frozen(p))) 337 if (unlikely(frozen(p)))
337 __thaw_task(p); 338 __thaw_task(p);
338 return ERR_PTR(-1UL); 339 if (!force_kill)
340 return ERR_PTR(-1UL);
339 } 341 }
340 if (!p->mm) 342 if (!p->mm)
341 continue; 343 continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 if (p == current) { 355 if (p == current) {
354 chosen = p; 356 chosen = p;
355 *ppoints = 1000; 357 *ppoints = 1000;
356 } else { 358 } else if (!force_kill) {
357 /* 359 /*
358 * If this task is not being ptraced on exit, 360 * If this task is not being ptraced on exit,
359 * then wait for it to finish before killing 361 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
434} 436}
435 437
436#define K(x) ((x) << (PAGE_SHIFT-10)) 438#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p) 439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438{ 440 unsigned int points, unsigned long totalpages,
439 struct task_struct *q; 441 struct mem_cgroup *memcg, nodemask_t *nodemask,
440 struct mm_struct *mm; 442 const char *message)
441
442 p = find_lock_task_mm(p);
443 if (!p)
444 return 1;
445
446 /* mm cannot be safely dereferenced after task_unlock(p) */
447 mm = p->mm;
448
449 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
450 task_pid_nr(p), p->comm, K(p->mm->total_vm),
451 K(get_mm_counter(p->mm, MM_ANONPAGES)),
452 K(get_mm_counter(p->mm, MM_FILEPAGES)));
453 task_unlock(p);
454
455 /*
456 * Kill all user processes sharing p->mm in other thread groups, if any.
457 * They don't get access to memory reserves or a higher scheduler
458 * priority, though, to avoid depletion of all memory or task
459 * starvation. This prevents mm->mmap_sem livelock when an oom killed
460 * task cannot exit because it requires the semaphore and its contended
461 * by another thread trying to allocate memory itself. That thread will
462 * now get access to memory reserves since it has a pending fatal
463 * signal.
464 */
465 for_each_process(q)
466 if (q->mm == mm && !same_thread_group(q, p) &&
467 !(q->flags & PF_KTHREAD)) {
468 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
469 continue;
470
471 task_lock(q); /* Protect ->comm from prctl() */
472 pr_err("Kill process %d (%s) sharing same memory\n",
473 task_pid_nr(q), q->comm);
474 task_unlock(q);
475 force_sig(SIGKILL, q);
476 }
477
478 set_tsk_thread_flag(p, TIF_MEMDIE);
479 force_sig(SIGKILL, p);
480
481 return 0;
482}
483#undef K
484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message)
489{ 443{
490 struct task_struct *victim = p; 444 struct task_struct *victim = p;
491 struct task_struct *child; 445 struct task_struct *child;
492 struct task_struct *t = p; 446 struct task_struct *t = p;
447 struct mm_struct *mm;
493 unsigned int victim_points = 0; 448 unsigned int victim_points = 0;
494 449 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
495 if (printk_ratelimit()) 450 DEFAULT_RATELIMIT_BURST);
496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 451
498 /* 452 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 453 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
501 */ 455 */
502 if (p->flags & PF_EXITING) { 456 if (p->flags & PF_EXITING) {
503 set_tsk_thread_flag(p, TIF_MEMDIE); 457 set_tsk_thread_flag(p, TIF_MEMDIE);
504 return 0; 458 return;
505 } 459 }
506 460
461 if (__ratelimit(&oom_rs))
462 dump_header(p, gfp_mask, order, memcg, nodemask);
463
507 task_lock(p); 464 task_lock(p);
508 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 465 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
509 message, task_pid_nr(p), p->comm, points); 466 message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 490 }
534 } while_each_thread(p, t); 491 } while_each_thread(p, t);
535 492
536 return oom_kill_task(victim); 493 victim = find_lock_task_mm(victim);
494 if (!victim)
495 return;
496
497 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm;
499 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
500 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
501 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
502 K(get_mm_counter(victim->mm, MM_FILEPAGES)));
503 task_unlock(victim);
504
505 /*
506 * Kill all user processes sharing victim->mm in other thread groups, if
507 * any. They don't get access to memory reserves, though, to avoid
508 * depletion of all memory. This prevents mm->mmap_sem livelock when an
509 * oom killed thread cannot exit because it requires the semaphore and
510 * its contended by another thread trying to allocate memory itself.
511 * That thread will now get access to memory reserves since it has a
512 * pending fatal signal.
513 */
514 for_each_process(p)
515 if (p->mm == mm && !same_thread_group(p, victim) &&
516 !(p->flags & PF_KTHREAD)) {
517 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
518 continue;
519
520 task_lock(p); /* Protect ->comm from prctl() */
521 pr_err("Kill process %d (%s) sharing same memory\n",
522 task_pid_nr(p), p->comm);
523 task_unlock(p);
524 force_sig(SIGKILL, p);
525 }
526
527 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 force_sig(SIGKILL, victim);
537} 529}
530#undef K
538 531
539/* 532/*
540 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 533 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 554}
562 555
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) 557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
565{ 559{
566 unsigned long limit; 560 unsigned long limit;
567 unsigned int points = 0; 561 unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
577 return; 571 return;
578 } 572 }
579 573
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 576 read_lock(&tasklist_lock);
583retry: 577 p = select_bad_process(&points, limit, memcg, NULL, false);
584 p = select_bad_process(&points, limit, memcg, NULL); 578 if (p && PTR_ERR(p) != -1UL)
585 if (!p || PTR_ERR(p) == -1UL) 579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
586 goto out; 580 "Memory cgroup out of memory");
587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory"))
590 goto retry;
591out:
592 read_unlock(&tasklist_lock); 581 read_unlock(&tasklist_lock);
593} 582}
594#endif 583#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
700 * @gfp_mask: memory allocation flags 689 * @gfp_mask: memory allocation flags
701 * @order: amount of memory being requested as a power of 2 690 * @order: amount of memory being requested as a power of 2
702 * @nodemask: nodemask passed to page allocator 691 * @nodemask: nodemask passed to page allocator
692 * @force_kill: true if a task must be killed, even if others are exiting
703 * 693 *
704 * If we run out of memory, we have the choice between either 694 * If we run out of memory, we have the choice between either
705 * killing a random task (bad), letting the system crash (worse) 695 * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
707 * don't have to be perfect here, we just have to be good. 697 * don't have to be perfect here, we just have to be good.
708 */ 698 */
709void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 699void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
710 int order, nodemask_t *nodemask) 700 int order, nodemask_t *nodemask, bool force_kill)
711{ 701{
712 const nodemask_t *mpol_mask; 702 const nodemask_t *mpol_mask;
713 struct task_struct *p; 703 struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
745 if (sysctl_oom_kill_allocating_task && 735 if (sysctl_oom_kill_allocating_task &&
746 !oom_unkillable_task(current, NULL, nodemask) && 736 !oom_unkillable_task(current, NULL, nodemask) &&
747 current->mm) { 737 current->mm) {
748 /* 738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
749 * oom_kill_process() needs tasklist_lock held. If it returns 739 nodemask,
750 * non-zero, current could not be killed so we must fallback to 740 "Out of memory (oom_kill_allocating_task)");
751 * the tasklist scan.
752 */
753 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
754 NULL, nodemask,
755 "Out of memory (oom_kill_allocating_task)"))
756 goto out;
757 }
758
759retry:
760 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
761 if (PTR_ERR(p) == -1UL)
762 goto out; 741 goto out;
742 }
763 743
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask,
745 force_kill);
764 /* Found nothing?!?! Either we hang forever, or we panic. */ 746 /* Found nothing?!?! Either we hang forever, or we panic. */
765 if (!p) { 747 if (!p) {
766 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
767 read_unlock(&tasklist_lock); 749 read_unlock(&tasklist_lock);
768 panic("Out of memory and no killable processes...\n"); 750 panic("Out of memory and no killable processes...\n");
769 } 751 }
770 752 if (PTR_ERR(p) != -1UL) {
771 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 753 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
772 nodemask, "Out of memory")) 754 nodemask, "Out of memory");
773 goto retry; 755 killed = 1;
774 killed = 1; 756 }
775out: 757out:
776 read_unlock(&tasklist_lock); 758 read_unlock(&tasklist_lock);
777 759
@@ -792,7 +774,7 @@ out:
792void pagefault_out_of_memory(void) 774void pagefault_out_of_memory(void)
793{ 775{
794 if (try_set_system_oom()) { 776 if (try_set_system_oom()) {
795 out_of_memory(NULL, 0, 0, NULL); 777 out_of_memory(NULL, 0, 0, NULL, false);
796 clear_system_oom(); 778 clear_system_oom();
797 } 779 }
798 if (!test_thread_flag(TIF_MEMDIE)) 780 if (!test_thread_flag(TIF_MEMDIE))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba7082ef5..3fc261705b1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1472 1472
1473 for ( ; ; ) { 1473 for ( ; ; ) {
1474 global_dirty_limits(&background_thresh, &dirty_thresh); 1474 global_dirty_limits(&background_thresh, &dirty_thresh);
1475 dirty_thresh = hard_dirty_limit(dirty_thresh);
1475 1476
1476 /* 1477 /*
1477 * Boost the allowable dirty threshold a bit for page 1478 * Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a13ded1938f0..caea788628e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1968 goto out; 1968 goto out;
1969 } 1969 }
1970 /* Exhausted what can be done so it's blamo time */ 1970 /* Exhausted what can be done so it's blamo time */
1971 out_of_memory(zonelist, gfp_mask, order, nodemask); 1971 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
1972 1972
1973out: 1973out:
1974 clear_zonelist_oom(zonelist, gfp_mask); 1974 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1990 if (!order) 1990 if (!order)
1991 return NULL; 1991 return NULL;
1992 1992
1993 if (compaction_deferred(preferred_zone)) { 1993 if (compaction_deferred(preferred_zone, order)) {
1994 *deferred_compaction = true; 1994 *deferred_compaction = true;
1995 return NULL; 1995 return NULL;
1996 } 1996 }
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2012 if (page) { 2012 if (page) {
2013 preferred_zone->compact_considered = 0; 2013 preferred_zone->compact_considered = 0;
2014 preferred_zone->compact_defer_shift = 0; 2014 preferred_zone->compact_defer_shift = 0;
2015 if (order >= preferred_zone->compact_order_failed)
2016 preferred_zone->compact_order_failed = order + 1;
2015 count_vm_event(COMPACTSUCCESS); 2017 count_vm_event(COMPACTSUCCESS);
2016 return page; 2018 return page;
2017 } 2019 }
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2028 * defer if the failure was a sync compaction failure. 2030 * defer if the failure was a sync compaction failure.
2029 */ 2031 */
2030 if (sync_migration) 2032 if (sync_migration)
2031 defer_compaction(preferred_zone); 2033 defer_compaction(preferred_zone, order);
2032 2034
2033 cond_resched(); 2035 cond_resched();
2034 } 2036 }
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2378{ 2380{
2379 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2381 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2380 struct zone *preferred_zone; 2382 struct zone *preferred_zone;
2381 struct page *page; 2383 struct page *page = NULL;
2382 int migratetype = allocflags_to_migratetype(gfp_mask); 2384 int migratetype = allocflags_to_migratetype(gfp_mask);
2385 unsigned int cpuset_mems_cookie;
2383 2386
2384 gfp_mask &= gfp_allowed_mask; 2387 gfp_mask &= gfp_allowed_mask;
2385 2388
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2398 if (unlikely(!zonelist->_zonerefs->zone)) 2401 if (unlikely(!zonelist->_zonerefs->zone))
2399 return NULL; 2402 return NULL;
2400 2403
2401 get_mems_allowed(); 2404retry_cpuset:
2405 cpuset_mems_cookie = get_mems_allowed();
2406
2402 /* The preferred zone is used for statistics later */ 2407 /* The preferred zone is used for statistics later */
2403 first_zones_zonelist(zonelist, high_zoneidx, 2408 first_zones_zonelist(zonelist, high_zoneidx,
2404 nodemask ? : &cpuset_current_mems_allowed, 2409 nodemask ? : &cpuset_current_mems_allowed,
2405 &preferred_zone); 2410 &preferred_zone);
2406 if (!preferred_zone) { 2411 if (!preferred_zone)
2407 put_mems_allowed(); 2412 goto out;
2408 return NULL;
2409 }
2410 2413
2411 /* First allocation attempt */ 2414 /* First allocation attempt */
2412 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2415 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2416 page = __alloc_pages_slowpath(gfp_mask, order, 2419 page = __alloc_pages_slowpath(gfp_mask, order,
2417 zonelist, high_zoneidx, nodemask, 2420 zonelist, high_zoneidx, nodemask,
2418 preferred_zone, migratetype); 2421 preferred_zone, migratetype);
2419 put_mems_allowed();
2420 2422
2421 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2423 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2424
2425out:
2426 /*
2427 * When updating a task's mems_allowed, it is possible to race with
2428 * parallel threads in such a way that an allocation can fail while
2429 * the mask is being updated. If a page allocation is about to fail,
2430 * check if the cpuset changed during allocation and if so, retry.
2431 */
2432 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2433 goto retry_cpuset;
2434
2422 return page; 2435 return page;
2423} 2436}
2424EXPORT_SYMBOL(__alloc_pages_nodemask); 2437EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2632bool skip_free_areas_node(unsigned int flags, int nid) 2645bool skip_free_areas_node(unsigned int flags, int nid)
2633{ 2646{
2634 bool ret = false; 2647 bool ret = false;
2648 unsigned int cpuset_mems_cookie;
2635 2649
2636 if (!(flags & SHOW_MEM_FILTER_NODES)) 2650 if (!(flags & SHOW_MEM_FILTER_NODES))
2637 goto out; 2651 goto out;
2638 2652
2639 get_mems_allowed(); 2653 do {
2640 ret = !node_isset(nid, cpuset_current_mems_allowed); 2654 cpuset_mems_cookie = get_mems_allowed();
2641 put_mems_allowed(); 2655 ret = !node_isset(nid, cpuset_current_mems_allowed);
2656 } while (!put_mems_allowed(cpuset_mems_cookie));
2642out: 2657out:
2643 return ret; 2658 return ret;
2644} 2659}
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3925 } 3940 }
3926} 3941}
3927 3942
3928int __init add_from_early_node_map(struct range *range, int az,
3929 int nr_range, int nid)
3930{
3931 unsigned long start_pfn, end_pfn;
3932 int i;
3933
3934 /* need to go over early_node_map to find out good range for node */
3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3937 return nr_range;
3938}
3939
3940/** 3943/**
3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3944 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3945 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
4521 * memory. When they don't, some nodes will have more kernelcore than 4524 * memory. When they don't, some nodes will have more kernelcore than
4522 * others 4525 * others
4523 */ 4526 */
4524static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 4527static void __init find_zone_movable_pfns_for_nodes(void)
4525{ 4528{
4526 int i, nid; 4529 int i, nid;
4527 unsigned long usable_startpfn; 4530 unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4713 4716
4714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4717 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4718 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4716 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 4719 find_zone_movable_pfns_for_nodes();
4717 4720
4718 /* Print out the zone ranges */ 4721 /* Print out the zone ranges */
4719 printk("Zone PFN ranges:\n"); 4722 printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
4823 int cpu = (unsigned long)hcpu; 4826 int cpu = (unsigned long)hcpu;
4824 4827
4825 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4828 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4829 lru_add_drain_cpu(cpu);
4826 drain_pages(cpu); 4830 drain_pages(cpu);
4827 4831
4828 /* 4832 /*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e0..5a74fea182f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp) 70 unsigned long address, pmd_t *pmdp)
71{ 71{
72 int young; 72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE 73#ifdef CONFIG_TRANSPARENT_HUGEPAGE
74 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
75#else
74 BUG(); 76 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 77#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp); 78 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young) 79 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 80 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c8..5b5ad584ffb7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121} 121}
122 122
123static void anon_vma_chain_link(struct vm_area_struct *vma,
124 struct anon_vma_chain *avc,
125 struct anon_vma *anon_vma)
126{
127 avc->vma = vma;
128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136}
137
123/** 138/**
124 * anon_vma_prepare - attach an anon_vma to a memory region 139 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 140 * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
175 spin_lock(&mm->page_table_lock); 190 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 191 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 192 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 193 anon_vma_chain_link(vma, avc, anon_vma);
179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 194 allocated = NULL;
183 avc = NULL; 195 avc = NULL;
184 } 196 }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
224 mutex_unlock(&root->mutex); 236 mutex_unlock(&root->mutex);
225} 237}
226 238
227static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma)
230{
231 avc->vma = vma;
232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234
235 /*
236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page().
238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240}
241
242/* 239/*
243 * Attach the anon_vmas from src to dst. 240 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 241 * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
1151 */ 1148 */
1152void page_add_file_rmap(struct page *page) 1149void page_add_file_rmap(struct page *page)
1153{ 1150{
1151 bool locked;
1152 unsigned long flags;
1153
1154 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1154 if (atomic_inc_and_test(&page->_mapcount)) { 1155 if (atomic_inc_and_test(&page->_mapcount)) {
1155 __inc_zone_page_state(page, NR_FILE_MAPPED); 1156 __inc_zone_page_state(page, NR_FILE_MAPPED);
1156 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1157 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1157 } 1158 }
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1158} 1160}
1159 1161
1160/** 1162/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
1165 */ 1167 */
1166void page_remove_rmap(struct page *page) 1168void page_remove_rmap(struct page *page)
1167{ 1169{
1170 bool anon = PageAnon(page);
1171 bool locked;
1172 unsigned long flags;
1173
1174 /*
1175 * The anon case has no mem_cgroup page_stat to update; but may
1176 * uncharge_page() below, where the lock ordering can deadlock if
1177 * we hold the lock against page_stat move: so avoid it on anon.
1178 */
1179 if (!anon)
1180 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1181
1168 /* page still mapped by someone else? */ 1182 /* page still mapped by someone else? */
1169 if (!atomic_add_negative(-1, &page->_mapcount)) 1183 if (!atomic_add_negative(-1, &page->_mapcount))
1170 return; 1184 goto out;
1171 1185
1172 /* 1186 /*
1173 * Now that the last pte has gone, s390 must transfer dirty 1187 * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
1176 * not if it's in swapcache - there might be another pte slot 1190 * not if it's in swapcache - there might be another pte slot
1177 * containing the swap entry, but page not yet written to swap. 1191 * containing the swap entry, but page not yet written to swap.
1178 */ 1192 */
1179 if ((!PageAnon(page) || PageSwapCache(page)) && 1193 if ((!anon || PageSwapCache(page)) &&
1180 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1194 page_test_and_clear_dirty(page_to_pfn(page), 1))
1181 set_page_dirty(page); 1195 set_page_dirty(page);
1182 /* 1196 /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
1184 * and not charged by memcg for now. 1198 * and not charged by memcg for now.
1185 */ 1199 */
1186 if (unlikely(PageHuge(page))) 1200 if (unlikely(PageHuge(page)))
1187 return; 1201 goto out;
1188 if (PageAnon(page)) { 1202 if (anon) {
1189 mem_cgroup_uncharge_page(page); 1203 mem_cgroup_uncharge_page(page);
1190 if (!PageTransHuge(page)) 1204 if (!PageTransHuge(page))
1191 __dec_zone_page_state(page, NR_ANON_PAGES); 1205 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
1205 * Leaving it set also helps swapoff to reinstate ptes 1219 * Leaving it set also helps swapoff to reinstate ptes
1206 * faster for those pages still in swapcache. 1220 * faster for those pages still in swapcache.
1207 */ 1221 */
1222out:
1223 if (!anon)
1224 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1208} 1225}
1209 1226
1210/* 1227/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1282 } 1299 }
1283 dec_mm_counter(mm, MM_ANONPAGES); 1300 dec_mm_counter(mm, MM_ANONPAGES);
1284 inc_mm_counter(mm, MM_SWAPENTS); 1301 inc_mm_counter(mm, MM_SWAPENTS);
1285 } else if (PAGE_MIGRATION) { 1302 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1286 /* 1303 /*
1287 * Store the pfn of the page in a special migration 1304 * Store the pfn of the page in a special migration
1288 * pte. do_swap_page() will wait until the migration 1305 * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1293 } 1310 }
1294 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1311 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1295 BUG_ON(pte_file(*pte)); 1312 BUG_ON(pte_file(*pte));
1296 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1313 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1314 (TTU_ACTION(flags) == TTU_MIGRATION)) {
1297 /* Establish migration entry for a file page */ 1315 /* Establish migration entry for a file page */
1298 swp_entry_t entry; 1316 swp_entry_t entry;
1299 entry = make_migration_entry(page, pte_write(pteval)); 1317 entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 * locking requirements of exec(), migration skips 1517 * locking requirements of exec(), migration skips
1500 * temporary VMAs until after exec() completes. 1518 * temporary VMAs until after exec() completes.
1501 */ 1519 */
1502 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1520 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1503 is_vma_temporary_stack(vma)) 1521 is_vma_temporary_stack(vma))
1504 continue; 1522 continue;
1505 1523
diff --git a/mm/shmem.c b/mm/shmem.c
index 7a45ad004cfd..f99ff3e50bd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1178static const struct inode_operations shmem_symlink_inode_operations; 1178static const struct inode_operations shmem_symlink_inode_operations;
1179static const struct inode_operations shmem_short_symlink_operations; 1179static const struct inode_operations shmem_short_symlink_operations;
1180 1180
1181#ifdef CONFIG_TMPFS_XATTR
1182static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1183#else
1184#define shmem_initxattrs NULL
1185#endif
1186
1181static int 1187static int
1182shmem_write_begin(struct file *file, struct address_space *mapping, 1188shmem_write_begin(struct file *file, struct address_space *mapping,
1183 loff_t pos, unsigned len, unsigned flags, 1189 loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1490 if (inode) { 1496 if (inode) {
1491 error = security_inode_init_security(inode, dir, 1497 error = security_inode_init_security(inode, dir,
1492 &dentry->d_name, 1498 &dentry->d_name,
1493 NULL, NULL); 1499 shmem_initxattrs, NULL);
1494 if (error) { 1500 if (error) {
1495 if (error != -EOPNOTSUPP) { 1501 if (error != -EOPNOTSUPP) {
1496 iput(inode); 1502 iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1630 return -ENOSPC; 1636 return -ENOSPC;
1631 1637
1632 error = security_inode_init_security(inode, dir, &dentry->d_name, 1638 error = security_inode_init_security(inode, dir, &dentry->d_name,
1633 NULL, NULL); 1639 shmem_initxattrs, NULL);
1634 if (error) { 1640 if (error) {
1635 if (error != -EOPNOTSUPP) { 1641 if (error != -EOPNOTSUPP) {
1636 iput(inode); 1642 iput(inode);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
1704 * filesystem level, though. 1710 * filesystem level, though.
1705 */ 1711 */
1706 1712
1713/*
1714 * Allocate new xattr and copy in the value; but leave the name to callers.
1715 */
1716static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
1717{
1718 struct shmem_xattr *new_xattr;
1719 size_t len;
1720
1721 /* wrap around? */
1722 len = sizeof(*new_xattr) + size;
1723 if (len <= sizeof(*new_xattr))
1724 return NULL;
1725
1726 new_xattr = kmalloc(len, GFP_KERNEL);
1727 if (!new_xattr)
1728 return NULL;
1729
1730 new_xattr->size = size;
1731 memcpy(new_xattr->value, value, size);
1732 return new_xattr;
1733}
1734
1735/*
1736 * Callback for security_inode_init_security() for acquiring xattrs.
1737 */
1738static int shmem_initxattrs(struct inode *inode,
1739 const struct xattr *xattr_array,
1740 void *fs_info)
1741{
1742 struct shmem_inode_info *info = SHMEM_I(inode);
1743 const struct xattr *xattr;
1744 struct shmem_xattr *new_xattr;
1745 size_t len;
1746
1747 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
1748 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
1749 if (!new_xattr)
1750 return -ENOMEM;
1751
1752 len = strlen(xattr->name) + 1;
1753 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
1754 GFP_KERNEL);
1755 if (!new_xattr->name) {
1756 kfree(new_xattr);
1757 return -ENOMEM;
1758 }
1759
1760 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
1761 XATTR_SECURITY_PREFIX_LEN);
1762 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
1763 xattr->name, len);
1764
1765 spin_lock(&info->lock);
1766 list_add(&new_xattr->list, &info->xattr_list);
1767 spin_unlock(&info->lock);
1768 }
1769
1770 return 0;
1771}
1772
1707static int shmem_xattr_get(struct dentry *dentry, const char *name, 1773static int shmem_xattr_get(struct dentry *dentry, const char *name,
1708 void *buffer, size_t size) 1774 void *buffer, size_t size)
1709{ 1775{
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
1731 return ret; 1797 return ret;
1732} 1798}
1733 1799
1734static int shmem_xattr_set(struct dentry *dentry, const char *name, 1800static int shmem_xattr_set(struct inode *inode, const char *name,
1735 const void *value, size_t size, int flags) 1801 const void *value, size_t size, int flags)
1736{ 1802{
1737 struct inode *inode = dentry->d_inode;
1738 struct shmem_inode_info *info = SHMEM_I(inode); 1803 struct shmem_inode_info *info = SHMEM_I(inode);
1739 struct shmem_xattr *xattr; 1804 struct shmem_xattr *xattr;
1740 struct shmem_xattr *new_xattr = NULL; 1805 struct shmem_xattr *new_xattr = NULL;
1741 size_t len;
1742 int err = 0; 1806 int err = 0;
1743 1807
1744 /* value == NULL means remove */ 1808 /* value == NULL means remove */
1745 if (value) { 1809 if (value) {
1746 /* wrap around? */ 1810 new_xattr = shmem_xattr_alloc(value, size);
1747 len = sizeof(*new_xattr) + size;
1748 if (len <= sizeof(*new_xattr))
1749 return -ENOMEM;
1750
1751 new_xattr = kmalloc(len, GFP_KERNEL);
1752 if (!new_xattr) 1811 if (!new_xattr)
1753 return -ENOMEM; 1812 return -ENOMEM;
1754 1813
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
1757 kfree(new_xattr); 1816 kfree(new_xattr);
1758 return -ENOMEM; 1817 return -ENOMEM;
1759 } 1818 }
1760
1761 new_xattr->size = size;
1762 memcpy(new_xattr->value, value, size);
1763 } 1819 }
1764 1820
1765 spin_lock(&info->lock); 1821 spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
1858 if (size == 0) 1914 if (size == 0)
1859 value = ""; /* empty EA, do not remove */ 1915 value = ""; /* empty EA, do not remove */
1860 1916
1861 return shmem_xattr_set(dentry, name, value, size, flags); 1917 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
1862 1918
1863} 1919}
1864 1920
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
1878 if (err) 1934 if (err)
1879 return err; 1935 return err;
1880 1936
1881 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1937 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1882} 1938}
1883 1939
1884static bool xattr_is_trusted(const char *name) 1940static bool xattr_is_trusted(const char *name)
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3b..29c8716eb7a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3284 if (in_interrupt() || (flags & __GFP_THISNODE)) 3284 if (in_interrupt() || (flags & __GFP_THISNODE))
3285 return NULL; 3285 return NULL;
3286 nid_alloc = nid_here = numa_mem_id(); 3286 nid_alloc = nid_here = numa_mem_id();
3287 get_mems_allowed();
3288 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3287 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3289 nid_alloc = cpuset_slab_spread_node(); 3288 nid_alloc = cpuset_slab_spread_node();
3290 else if (current->mempolicy) 3289 else if (current->mempolicy)
3291 nid_alloc = slab_node(current->mempolicy); 3290 nid_alloc = slab_node(current->mempolicy);
3292 put_mems_allowed();
3293 if (nid_alloc != nid_here) 3291 if (nid_alloc != nid_here)
3294 return ____cache_alloc_node(cachep, flags, nid_alloc); 3292 return ____cache_alloc_node(cachep, flags, nid_alloc);
3295 return NULL; 3293 return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3312 enum zone_type high_zoneidx = gfp_zone(flags); 3310 enum zone_type high_zoneidx = gfp_zone(flags);
3313 void *obj = NULL; 3311 void *obj = NULL;
3314 int nid; 3312 int nid;
3313 unsigned int cpuset_mems_cookie;
3315 3314
3316 if (flags & __GFP_THISNODE) 3315 if (flags & __GFP_THISNODE)
3317 return NULL; 3316 return NULL;
3318 3317
3319 get_mems_allowed();
3320 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3321 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3318 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3322 3319
3320retry_cpuset:
3321 cpuset_mems_cookie = get_mems_allowed();
3322 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3323
3323retry: 3324retry:
3324 /* 3325 /*
3325 * Look through allowed nodes for objects available 3326 * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
3372 } 3373 }
3373 } 3374 }
3374 } 3375 }
3375 put_mems_allowed(); 3376
3377 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3378 goto retry_cpuset;
3376 return obj; 3379 return obj;
3377} 3380}
3378 3381
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..f4a6229848fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1581 struct zone *zone; 1581 struct zone *zone;
1582 enum zone_type high_zoneidx = gfp_zone(flags); 1582 enum zone_type high_zoneidx = gfp_zone(flags);
1583 void *object; 1583 void *object;
1584 unsigned int cpuset_mems_cookie;
1584 1585
1585 /* 1586 /*
1586 * The defrag ratio allows a configuration of the tradeoffs between 1587 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1605 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1605 return NULL; 1606 return NULL;
1606 1607
1607 get_mems_allowed(); 1608 do {
1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 cpuset_mems_cookie = get_mems_allowed();
1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1610 struct kmem_cache_node *n; 1611 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1611 1612 struct kmem_cache_node *n;
1612 n = get_node(s, zone_to_nid(zone)); 1613
1613 1614 n = get_node(s, zone_to_nid(zone));
1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1615
1615 n->nr_partial > s->min_partial) { 1616 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1616 object = get_partial_node(s, n, c); 1617 n->nr_partial > s->min_partial) {
1617 if (object) { 1618 object = get_partial_node(s, n, c);
1618 put_mems_allowed(); 1619 if (object) {
1619 return object; 1620 /*
1621 * Return the object even if
1622 * put_mems_allowed indicated that
1623 * the cpuset mems_allowed was
1624 * updated in parallel. It's a
1625 * harmless race between the alloc
1626 * and the cpuset update.
1627 */
1628 put_mems_allowed(cpuset_mems_cookie);
1629 return object;
1630 }
1620 } 1631 }
1621 } 1632 }
1622 } 1633 } while (!put_mems_allowed(cpuset_mems_cookie));
1623 put_mems_allowed();
1624#endif 1634#endif
1625 return NULL; 1635 return NULL;
1626} 1636}
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde23111..a8bc7d364deb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index 14380e9fbe33..5c13f1338972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
496 * Either "cpu" is the current CPU, and preemption has already been 496 * Either "cpu" is the current CPU, and preemption has already been
497 * disabled; or "cpu" is being hot-unplugged, and is already dead. 497 * disabled; or "cpu" is being hot-unplugged, and is already dead.
498 */ 498 */
499static void drain_cpu_pagevecs(int cpu) 499void lru_add_drain_cpu(int cpu)
500{ 500{
501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
502 struct pagevec *pvec; 502 struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
553 553
554void lru_add_drain(void) 554void lru_add_drain(void)
555{ 555{
556 drain_cpu_pagevecs(get_cpu()); 556 lru_add_drain_cpu(get_cpu());
557 put_cpu(); 557 put_cpu();
558} 558}
559 559
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..9d3dd3763cf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
373 struct vm_area_struct *vma, unsigned long addr) 373 struct vm_area_struct *vma, unsigned long addr)
374{ 374{
375 int nr_pages;
376 struct page *page; 375 struct page *page;
377 unsigned long offset; 376 unsigned long offset = swp_offset(entry);
378 unsigned long end_offset; 377 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1;
379 379
380 /* 380 /* Read a page_cluster sized and aligned cluster around offset. */
381 * Get starting offset for readaround, and number of pages to read. 381 start_offset = offset & ~mask;
382 * Adjust starting address by readbehind (for NUMA interleave case)? 382 end_offset = offset | mask;
383 * No, it's very unlikely that swap layout would follow vma layout, 383 if (!start_offset) /* First page is swap header. */
384 * more likely that neighbouring swap pages came from the same node: 384 start_offset++;
385 * so use the same "addr" to choose the same node for each swap read. 385
386 */ 386 for (offset = start_offset; offset <= end_offset ; offset++) {
387 nr_pages = valid_swaphandles(entry, &offset);
388 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
389 /* Ok, do the async read-ahead now */ 387 /* Ok, do the async read-ahead now */
390 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
391 gfp_mask, vma, addr); 389 gfp_mask, vma, addr);
392 if (!page) 390 if (!page)
393 break; 391 continue;
394 page_cache_release(page); 392 page_cache_release(page);
395 } 393 }
396 lru_add_drain(); /* Push any new pages onto the LRU now */ 394 lru_add_drain(); /* Push any new pages onto the LRU now */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6bf67ab6e469..dae42f380d6e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2107 p->flags |= SWP_SOLIDSTATE; 2105 p->flags |= SWP_SOLIDSTATE;
2108 p->cluster_next = 1 + (random32() % p->highest_bit); 2106 p->cluster_next = 1 + (random32() % p->highest_bit);
2109 } 2107 }
2110 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) 2108 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2111 p->flags |= SWP_DISCARDABLE; 2109 p->flags |= SWP_DISCARDABLE;
2112 } 2110 }
2113 2111
@@ -2292,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry)
2292} 2290}
2293 2291
2294/* 2292/*
2295 * swap_lock prevents swap_map being freed. Don't grab an extra
2296 * reference on the swaphandle, it doesn't matter if it becomes unused.
2297 */
2298int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2299{
2300 struct swap_info_struct *si;
2301 int our_page_cluster = page_cluster;
2302 pgoff_t target, toff;
2303 pgoff_t base, end;
2304 int nr_pages = 0;
2305
2306 if (!our_page_cluster) /* no readahead */
2307 return 0;
2308
2309 si = swap_info[swp_type(entry)];
2310 target = swp_offset(entry);
2311 base = (target >> our_page_cluster) << our_page_cluster;
2312 end = base + (1 << our_page_cluster);
2313 if (!base) /* first page is swap header */
2314 base++;
2315
2316 spin_lock(&swap_lock);
2317 if (end > si->max) /* don't go beyond end of map */
2318 end = si->max;
2319
2320 /* Count contiguous allocated slots above our target */
2321 for (toff = target; ++toff < end; nr_pages++) {
2322 /* Don't read in free or bad pages */
2323 if (!si->swap_map[toff])
2324 break;
2325 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2326 break;
2327 }
2328 /* Count contiguous allocated slots below our target */
2329 for (toff = target; --toff >= base; nr_pages++) {
2330 /* Don't read in free or bad pages */
2331 if (!si->swap_map[toff])
2332 break;
2333 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2334 break;
2335 }
2336 spin_unlock(&swap_lock);
2337
2338 /*
2339 * Indicate starting offset, and return number of pages to get:
2340 * if only 1, say 0, since there's then no readahead to be done.
2341 */
2342 *offset = ++toff;
2343 return nr_pages? ++nr_pages: 0;
2344}
2345
2346/*
2347 * add_swap_count_continuation - called when a swap count is duplicated 2293 * add_swap_count_continuation - called when a swap count is duplicated
2348 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2294 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2349 * page of the original vmalloc'ed swap_map, to hold the continuation count 2295 * page of the original vmalloc'ed swap_map, to hold the continuation count
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b8..ae962b31de88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
239 next->vm_prev = vma; 239 next->vm_prev = vma;
240} 240}
241 241
242/* Check if the vma is being used as a stack by this task */
243static int vm_is_stack_for_task(struct task_struct *t,
244 struct vm_area_struct *vma)
245{
246 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
247}
248
249/*
250 * Check if the vma is being used as a stack.
251 * If is_group is non-zero, check in the entire thread group or else
252 * just check in the current task. Returns the pid of the task that
253 * the vma is stack for.
254 */
255pid_t vm_is_stack(struct task_struct *task,
256 struct vm_area_struct *vma, int in_group)
257{
258 pid_t ret = 0;
259
260 if (vm_is_stack_for_task(task, vma))
261 return task->pid;
262
263 if (in_group) {
264 struct task_struct *t;
265 rcu_read_lock();
266 if (!pid_alive(task))
267 goto done;
268
269 t = task;
270 do {
271 if (vm_is_stack_for_task(t, vma)) {
272 ret = t->pid;
273 goto done;
274 }
275 } while_each_thread(task, t);
276done:
277 rcu_read_unlock();
278 }
279
280 return ret;
281}
282
242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 283#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
243void arch_pick_mmap_layout(struct mm_struct *mm) 284void arch_pick_mmap_layout(struct mm_struct *mm)
244{ 285{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b23552659..49f15ef0a99a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1138 * @mz: The mem_cgroup_zone to pull pages from. 1138 * @mz: The mem_cgroup_zone to pull pages from.
1139 * @dst: The temp list to put pages on to. 1139 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1140 * @nr_scanned: The number of pages that were scanned.
1141 * @order: The caller's attempted allocation order 1141 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1142 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1143 * @active: True [1] if isolating active pages
1144 * @file: True [1] if isolating file [!anon] pages 1144 * @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1147 */ 1147 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode, 1150 unsigned long *nr_scanned, struct scan_control *sc,
1151 int active, int file) 1151 isolate_mode_t mode, int active, int file)
1152{ 1152{
1153 struct lruvec *lruvec; 1153 struct lruvec *lruvec;
1154 struct list_head *src; 1154 struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1194 BUG(); 1194 BUG();
1195 } 1195 }
1196 1196
1197 if (!order) 1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue; 1198 continue;
1199 1199
1200 /* 1200 /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1208 */ 1208 */
1209 zone_id = page_zone_id(page); 1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page); 1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1); 1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << order); 1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) { 1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page; 1214 struct page *cursor_page;
1215 1215
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1275 1275
1276 *nr_scanned = scan; 1276 *nr_scanned = scan;
1277 1277
1278 trace_mm_vmscan_lru_isolate(order, 1278 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1279 nr_to_scan, scan,
1280 nr_taken, 1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1413 unsigned long *nr_anon, 1413 unsigned long *nr_anon,
1414 unsigned long *nr_file) 1414 unsigned long *nr_file)
1415{ 1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone; 1416 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, }; 1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0; 1418 unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1434 count[lru] += numpages; 1433 count[lru] += numpages;
1435 } 1434 }
1436 1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active); 1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438 1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450 1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon; 1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 reclaim_stat->recent_scanned[1] += *nr_file; 1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1453} 1454}
1454 1455
1455/* 1456/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1509 unsigned long nr_file; 1510 unsigned long nr_file;
1510 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone; 1514 struct zone *zone = mz->zone;
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1514 1516
1515 while (unlikely(too_many_isolated(zone, file, sc))) { 1517 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10); 1518 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 1524
1523 set_reclaim_mode(priority, sc, false); 1525 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE; 1527 isolate_mode |= ISOLATE_ACTIVE;
1526 1528
1527 lru_add_drain(); 1529 lru_add_drain();
1528 1530
1529 if (!sc->may_unmap) 1531 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED; 1532 isolate_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage) 1533 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN; 1534 isolate_mode |= ISOLATE_CLEAN;
1533 1535
1534 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(&zone->lru_lock);
1535 1537
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, 1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
1537 &nr_scanned, sc->order, 1539 sc, isolate_mode, 0, file);
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) { 1540 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd()) 1542 if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned); 1547 nr_scanned);
1547 } 1548 }
1549 spin_unlock_irq(&zone->lru_lock);
1548 1550
1549 if (nr_taken == 0) { 1551 if (nr_taken == 0)
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0; 1552 return 0;
1552 }
1553 1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555 1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback); 1557 &nr_dirty, &nr_writeback);
1563 1558
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1570 1565
1571 spin_lock_irq(&zone->lru_lock); 1566 spin_lock_irq(&zone->lru_lock);
1572 1567
1568 reclaim_stat->recent_scanned[0] += nr_anon;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570
1573 if (current_is_kswapd()) 1571 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
1643 unsigned long pgmoved = 0; 1641 unsigned long pgmoved = 0;
1644 struct page *page; 1642 struct page *page;
1645 1643
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) { 1644 while (!list_empty(list)) {
1659 struct lruvec *lruvec; 1645 struct lruvec *lruvec;
1660 1646
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 struct page *page; 1685 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0; 1687 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone; 1689 struct zone *zone = mz->zone;
1704 1690
1705 lru_add_drain(); 1691 lru_add_drain();
1706 1692
1693 reset_reclaim_mode(sc);
1694
1707 if (!sc->may_unmap) 1695 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED; 1696 isolate_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage) 1697 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN; 1698 isolate_mode |= ISOLATE_CLEAN;
1711 1699
1712 spin_lock_irq(&zone->lru_lock); 1700 spin_lock_irq(&zone->lru_lock);
1713 1701
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, 1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
1715 &nr_scanned, sc->order, 1703 isolate_mode, 1, file);
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc)) 1704 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned; 1705 zone->pages_scanned += nr_scanned;
1719 1706
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 continue; 1724 continue;
1738 } 1725 }
1739 1726
1727 if (unlikely(buffer_heads_over_limit)) {
1728 if (page_has_private(page) && trylock_page(page)) {
1729 if (page_has_private(page))
1730 try_to_release_page(page, 0);
1731 unlock_page(page);
1732 }
1733 }
1734
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1736 nr_rotated += hpage_nr_pages(page);
1742 /* 1737 /*
@@ -2112,7 +2107,12 @@ restart:
2112 * with multiple processes reclaiming pages, the total 2107 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 2108 * freeing target can get unreasonably large.
2114 */ 2109 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2110 if (nr_reclaimed >= nr_to_reclaim)
2111 nr_to_reclaim = 0;
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 2116 break;
2117 } 2117 }
2118 blk_finish_plug(&plug); 2118 blk_finish_plug(&plug);
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2195 * If compaction is deferred, reclaim up to a point where 2195 * If compaction is deferred, reclaim up to a point where
2196 * compaction will have a chance of success when re-enabled 2196 * compaction will have a chance of success when re-enabled
2197 */ 2197 */
2198 if (compaction_deferred(zone)) 2198 if (compaction_deferred(zone, sc->order))
2199 return watermark_ok; 2199 return watermark_ok;
2200 2200
2201 /* If compaction is not ready to start, keep reclaiming */ 2201 /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2235 unsigned long nr_soft_scanned; 2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false; 2236 bool aborted_reclaim = false;
2237 2237
2238 /*
2239 * If the number of buffer_heads in the machine exceeds the maximum
2240 * allowed level, force direct reclaim to scan the highmem zone as
2241 * highmem pages could be pinning lowmem pages storing buffer_heads
2242 */
2243 if (buffer_heads_over_limit)
2244 sc->gfp_mask |= __GFP_HIGHMEM;
2245
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2246 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) { 2247 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2255 * Even though compaction is invoked for any 2263 * Even though compaction is invoked for any
2256 * non-zero order, only frequent costly order 2264 * non-zero order, only frequent costly order
2257 * reclamation is disruptive enough to become a 2265 * reclamation is disruptive enough to become a
2258 * noticable problem, like transparent huge page 2266 * noticeable problem, like transparent huge
2259 * allocations. 2267 * page allocations.
2260 */ 2268 */
2261 if (compaction_ready(zone, sc)) { 2269 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true; 2270 aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 unsigned long writeback_threshold; 2345 unsigned long writeback_threshold;
2338 bool aborted_reclaim; 2346 bool aborted_reclaim;
2339 2347
2340 get_mems_allowed();
2341 delayacct_freepages_start(); 2348 delayacct_freepages_start();
2342 2349
2343 if (global_reclaim(sc)) 2350 if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2401 2408
2402out: 2409out:
2403 delayacct_freepages_end(); 2410 delayacct_freepages_end();
2404 put_mems_allowed();
2405 2411
2406 if (sc->nr_reclaimed) 2412 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed; 2413 return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
2724 */ 2730 */
2725 age_active_anon(zone, &sc, priority); 2731 age_active_anon(zone, &sc, priority);
2726 2732
2733 /*
2734 * If the number of buffer_heads in the machine
2735 * exceeds the maximum allowed level and this node
2736 * has a highmem zone, force kswapd to reclaim from
2737 * it to relieve lowmem pressure.
2738 */
2739 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2740 end_zone = i;
2741 break;
2742 }
2743
2727 if (!zone_watermark_ok_safe(zone, order, 2744 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) { 2745 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i; 2746 end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
2753 */ 2770 */
2754 for (i = 0; i <= end_zone; i++) { 2771 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i; 2772 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab; 2773 int nr_slab, testorder;
2757 unsigned long balance_gap; 2774 unsigned long balance_gap;
2758 2775
2759 if (!populated_zone(zone)) 2776 if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
2786 (zone->present_pages + 2803 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2804 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2805 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order, 2806 /*
2807 * Kswapd reclaims only single pages with compaction
2808 * enabled. Trying too hard to reclaim until contiguous
2809 * free pages have become available can hurt performance
2810 * by evicting too much useful data from memory.
2811 * Do not reclaim more than needed for compaction.
2812 */
2813 testorder = order;
2814 if (COMPACTION_BUILD && order &&
2815 compaction_suitable(zone, order) !=
2816 COMPACT_SKIPPED)
2817 testorder = 0;
2818
2819 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2820 !zone_watermark_ok_safe(zone, order,
2790 high_wmark_pages(zone) + balance_gap, 2821 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) { 2822 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc); 2823 shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
2815 continue; 2846 continue;
2816 } 2847 }
2817 2848
2818 if (!zone_watermark_ok_safe(zone, order, 2849 if (!zone_watermark_ok_safe(zone, testorder,
2819 high_wmark_pages(zone), end_zone, 0)) { 2850 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0; 2851 all_zones_ok = 0;
2821 /* 2852 /*
@@ -2903,6 +2934,8 @@ out:
2903 * and it is potentially going to sleep here. 2934 * and it is potentially going to sleep here.
2904 */ 2935 */
2905 if (order) { 2936 if (order) {
2937 int zones_need_compaction = 1;
2938
2906 for (i = 0; i <= end_zone; i++) { 2939 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i; 2940 struct zone *zone = pgdat->node_zones + i;
2908 2941
@@ -2912,6 +2945,10 @@ out:
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue; 2946 continue;
2914 2947
2948 /* Would compaction fail due to lack of free memory? */
2949 if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
2950 goto loop_again;
2951
2915 /* Confirm the zone is balanced for order-0 */ 2952 /* Confirm the zone is balanced for order-0 */
2916 if (!zone_watermark_ok(zone, 0, 2953 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) { 2954 high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2956,17 @@ out:
2919 goto loop_again; 2956 goto loop_again;
2920 } 2957 }
2921 2958
2959 /* Check if the memory needs to be defragmented. */
2960 if (zone_watermark_ok(zone, order,
2961 low_wmark_pages(zone), *classzone_idx, 0))
2962 zones_need_compaction = 0;
2963
2922 /* If balanced, clear the congested flag */ 2964 /* If balanced, clear the congested flag */
2923 zone_clear_flag(zone, ZONE_CONGESTED); 2965 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 } 2966 }
2967
2968 if (zones_need_compaction)
2969 compact_pgdat(pgdat, order);
2927 } 2970 }
2928 2971
2929 /* 2972 /*