author     Linus Torvalds <torvalds@linux-foundation.org>   2012-01-12 23:42:54 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-01-12 23:42:54 -0500
commit     099469502f62fbe0d7e4f0b83a2f22538367f734 (patch)
tree       5229c3818b2e6e09d35026d49314047121130536
parent     7c17d86a8502c2e30c2eea777ed1b830aa3b447b (diff)
parent     35f1526845a9d804206883e19bd257d3dcef758f (diff)
Merge branch 'akpm' (aka "Andrew's patch-bomb, take two")
Andrew explains:

 - various misc stuff

 - Most of the rest of MM: memcg, threaded hugepages, others.

 - cpumask

 - kexec

 - kdump

 - some direct-io performance tweaking

 - radix-tree optimisations

 - new selftests code

   A note on this: often people will develop a new userspace-visible feature and will develop userspace code to exercise/test that feature. Then they merge the patch and the selftest code dies. Sometimes we paste it into the changelog. Sometimes the code gets thrown into Documentation/(!).

   This saddens me. So this patch creates a bare-bones framework which will henceforth allow me to ask people to include their test apps in the kernel tree so we can keep them alive. Then when people enhance or fix the feature, I can ask them to update the test app too.

   The infrastructure is terribly trivial at present - let's see how it evolves.

 - checkpoint/restart feature work.

   A note on this: this is a project by various mad Russians to perform c/r mainly from userspace, with various oddball helper code added into the kernel where the need is demonstrated.

   So rather than some large central lump of code, what we have is little bits and pieces popping up in various places which either expose something new or which permit something which is normally kernel-private to be modified.

   The overall project is an ongoing thing. I've judged that the size and scope of the thing means that we're more likely to be successful with it if we integrate the support into mainline piecemeal rather than allowing it all to develop out-of-tree.

   However I'm less confident than the developers that it will all eventually work! So what I'm asking them to do is to wrap each piece of new code inside CONFIG_CHECKPOINT_RESTORE. So if it all eventually comes to tears and the project as a whole fails, it should be a simple matter to go through and delete all trace of it.

This lot pretty much wraps up the -rc1 merge for me.

* akpm: (96 commits)
  unlzo: fix input buffer free
  ramoops: update parameters only after successful init
  ramoops: fix use of rounddown_pow_of_two()
  c/r: prctl: add PR_SET_MM codes to set up mm_struct entries
  c/r: procfs: add start_data, end_data, start_brk members to /proc/$pid/stat v4
  c/r: introduce CHECKPOINT_RESTORE symbol
  selftests: new x86 breakpoints selftest
  selftests: new very basic kernel selftests directory
  radix_tree: take radix_tree_path off stack
  radix_tree: remove radix_tree_indirect_to_ptr()
  dio: optimize cache misses in the submission path
  vfs: cache request_queue in struct block_device
  fs/direct-io.c: calculate fs_count correctly in get_more_blocks()
  drivers/parport/parport_pc.c: fix warnings
  panic: don't print redundant backtraces on oops
  sysctl: add the kernel.ns_last_pid control
  kdump: add udev events for memory online/offline
  include/linux/crash_dump.h needs elf.h
  kdump: fix crash_kexec()/smp_send_stop() race in panic()
  kdump: crashk_res init check for /sys/kernel/kexec_crash_size
  ...
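[Editor's note] As a rough illustration of the CONFIG_CHECKPOINT_RESTORE wrapping policy described above: this is a hypothetical, compile-it-yourself sketch, not code from this merge; the Kconfig symbol is real, but cr_only_feature() is invented for the example.

#include <stdio.h>

#ifdef CONFIG_CHECKPOINT_RESTORE
static int cr_only_feature(void)
{
	/* checkpoint/restore-specific behaviour would live here */
	printf("checkpoint/restore helpers built in\n");
	return 0;
}
#else
static inline int cr_only_feature(void)
{
	return 0;	/* compiles away to a no-op when the option is off */
}
#endif

int main(void)
{
	return cr_only_feature();
}

Building with -DCONFIG_CHECKPOINT_RESTORE pulls the helper in; building without it leaves only the inline no-op, which mirrors how gating every c/r piece behind one symbol is meant to make the whole project trivially removable.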
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-slab | 4
-rw-r--r--  Documentation/cgroups/memory.txt | 9
-rw-r--r--  Documentation/filesystems/proc.txt | 3
-rw-r--r--  Documentation/sysctl/kernel.txt | 8
-rw-r--r--  Documentation/vm/slub.txt | 5
-rw-r--r--  arch/Kconfig | 14
-rw-r--r--  arch/avr32/include/asm/system.h | 2
-rw-r--r--  arch/avr32/kernel/traps.c | 2
-rw-r--r--  arch/ia64/include/asm/processor.h | 1
-rw-r--r--  arch/ia64/kernel/machine_kexec.c | 4
-rw-r--r--  arch/m68k/amiga/config.c | 3
-rw-r--r--  arch/mips/include/asm/ptrace.h | 2
-rw-r--r--  arch/mips/kernel/traps.c | 2
-rw-r--r--  arch/mn10300/include/asm/exceptions.h | 2
-rw-r--r--  arch/parisc/include/asm/processor.h | 2
-rw-r--r--  arch/parisc/kernel/process.c | 1
-rw-r--r--  arch/powerpc/kernel/machine_kexec_32.c | 4
-rw-r--r--  arch/powerpc/kernel/machine_kexec_64.c | 6
-rw-r--r--  arch/powerpc/mm/numa.c | 2
-rw-r--r--  arch/powerpc/platforms/pseries/nvram.c | 1
-rw-r--r--  arch/s390/include/asm/processor.h | 2
-rw-r--r--  arch/s390/kernel/nmi.c | 2
-rw-r--r--  arch/sh/kernel/process_32.c | 2
-rw-r--r--  arch/sh/kernel/process_64.c | 2
-rw-r--r--  arch/tile/kernel/machine_kexec.c | 6
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/Kconfig.cpu | 6
-rw-r--r--  arch/x86/mm/numa.c | 2
-rw-r--r--  arch/x86/um/Kconfig | 8
-rw-r--r--  drivers/base/memory.c | 17
-rw-r--r--  drivers/char/ramoops.c | 24
-rw-r--r--  drivers/mtd/mtdoops.c | 3
-rw-r--r--  drivers/parport/parport_pc.c | 4
-rw-r--r--  drivers/video/nvidia/nvidia.c | 6
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 5
-rw-r--r--  fs/direct-io.c | 57
-rw-r--r--  fs/eventpoll.c | 234
-rw-r--r--  fs/hugetlbfs/inode.c | 3
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  fs/pipe.c | 2
-rw-r--r--  fs/proc/array.c | 7
-rw-r--r--  fs/proc/base.c | 2
-rw-r--r--  include/asm-generic/tlb.h | 14
-rw-r--r--  include/linux/crash_dump.h | 1
-rw-r--r--  include/linux/eventpoll.h | 1
-rw-r--r--  include/linux/fs.h | 14
-rw-r--r--  include/linux/huge_mm.h | 2
-rw-r--r--  include/linux/kernel.h | 13
-rw-r--r--  include/linux/kmsg_dump.h | 1
-rw-r--r--  include/linux/linkage.h | 4
-rw-r--r--  include/linux/memcontrol.h | 105
-rw-r--r--  include/linux/migrate.h | 23
-rw-r--r--  include/linux/mm_inline.h | 44
-rw-r--r--  include/linux/mm_types.h | 9
-rw-r--r--  include/linux/mmzone.h | 28
-rw-r--r--  include/linux/oom.h | 2
-rw-r--r--  include/linux/page_cgroup.h | 46
-rw-r--r--  include/linux/pagevec.h | 12
-rw-r--r--  include/linux/prctl.h | 12
-rw-r--r--  include/linux/radix-tree.h | 3
-rw-r--r--  include/linux/rmap.h | 4
-rw-r--r--  include/linux/sched.h | 2
-rw-r--r--  include/trace/events/vmscan.h | 22
-rw-r--r--  init/Kconfig | 11
-rw-r--r--  kernel/exit.c | 6
-rw-r--r--  kernel/kexec.c | 25
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/panic.c | 26
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 31
-rw-r--r--  kernel/sys.c | 121
-rw-r--r--  lib/decompress_unlzo.c | 2
-rw-r--r--  lib/radix-tree.c | 154
-rw-r--r--  mm/compaction.c | 5
-rw-r--r--  mm/filemap.c | 18
-rw-r--r--  mm/huge_memory.c | 93
-rw-r--r--  mm/ksm.c | 11
-rw-r--r--  mm/memcontrol.c | 1102
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 4
-rw-r--r--  mm/memory_hotplug.c | 2
-rw-r--r--  mm/mempolicy.c | 2
-rw-r--r--  mm/migrate.c | 173
-rw-r--r--  mm/oom_kill.c | 42
-rw-r--r--  mm/page_alloc.c | 55
-rw-r--r--  mm/page_cgroup.c | 164
-rw-r--r--  mm/rmap.c | 20
-rw-r--r--  mm/slub.c | 9
-rw-r--r--  mm/swap.c | 79
-rw-r--r--  mm/swap_state.c | 10
-rw-r--r--  mm/swapfile.c | 9
-rw-r--r--  mm/vmalloc.c | 9
-rw-r--r--  mm/vmscan.c | 680
-rw-r--r--  mm/vmstat.c | 2
-rw-r--r--  tools/testing/selftests/Makefile | 11
-rw-r--r--  tools/testing/selftests/breakpoints/Makefile | 20
-rw-r--r--  tools/testing/selftests/breakpoints/breakpoint_test.c | 394
-rw-r--r--  tools/testing/selftests/run_tests | 8
100 files changed, 2589 insertions, 1562 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 8b093f8222d3..91bd6ca5440f 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -346,6 +346,10 @@ Description:
 		number of objects per slab. If a slab cannot be allocated
 		because of fragmentation, SLUB will retry with the minimum order
 		possible depending on its characteristics.
+		When debug_guardpage_minorder=N (N > 0) parameter is specified
+		(see Documentation/kernel-parameters.txt), the minimum possible
+		order is used and this sysfs entry can not be used to change
+		the order at run time.
 
 What:		/sys/kernel/slab/cache/order_fallback
 Date:		April 2008
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4d8774f6f48a..4c95c0034a4b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -61,7 +61,7 @@ Brief summary of control files.
 memory.failcnt			 # show the number of memory usage hits limits
 memory.memsw.failcnt		 # show the number of memory+Swap hits limits
 memory.max_usage_in_bytes	 # show max memory usage recorded
-memory.memsw.usage_in_bytes	 # show max memory+Swap usage recorded
+memory.memsw.max_usage_in_bytes	 # show max memory+Swap usage recorded
 memory.soft_limit_in_bytes	 # set/show soft limit of memory usage
 memory.stat			 # show various statistics
 memory.use_hierarchy		 # set/show hierarchical account enabled
@@ -410,8 +410,11 @@ memory.stat file includes following statistics
 cache		- # of bytes of page cache memory.
 rss		- # of bytes of anonymous and swap cache memory.
 mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		- # of pages paged in (equivalent to # of charging events).
-pgpgout		- # of pages paged out (equivalent to # of uncharging events).
+pgpgin		- # of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		- # of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
 inactive_anon	- # of bytes of anonymous memory and swap cache memory on
 		LRU list.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 12fee132fbe2..a76a26a1db8a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -307,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   blkio_ticks   time spent waiting for block IO
   gtime         guest time of the task in jiffies
   cgtime        guest time of the task children in jiffies
+  start_data    address above which program data+bss is placed
+  end_data      address below which program data+bss is placed
+  start_brk     address above which program heap can be expanded with brk()
 ..............................................................................
 
 The /proc/PID/maps file containing the currently mapped memory regions and
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d8cd8b2c30d..8c20fbd8b42d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated.
 
 ==============================================================
 
+ns_last_pid:
+
+The last pid allocated in the current (the one task using this sysctl
+lives in) pid namespace. When selecting a pid for a next task on fork
+kernel tries to allocate a number starting from this one.
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
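[Editor's note] A minimal userspace sketch of the ns_last_pid behaviour documented above (hypothetical example, not part of this merge; it assumes the caller may write the sysctl in its pid namespace, and the pid actually handed out still depends on which numbers are free):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");
	pid_t child;

	if (!f) {
		perror("ns_last_pid");	/* may require privileges in this namespace */
		return 1;
	}
	fprintf(f, "%d", 9999);		/* hint: try to hand out the next pid just above 9999 */
	fclose(f);

	child = fork();			/* child pid is expected near 10000 if it is free */
	if (child == 0)
		_exit(0);
	printf("forked child pid %d\n", (int)child);
	return 0;
}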
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index 2acdda9601b0..6752870c4970 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -131,7 +131,10 @@ slub_min_objects.
 slub_max_order specified the order at which slub_min_objects should no
 longer be checked. This is useful to avoid SLUB trying to generate
 super large order pages to fit slub_min_objects of a slab cache with
-large object sizes into one high order page.
+large object sizes into one high order page. Setting command line
+parameter debug_guardpage_minorder=N (N > 0), forces setting
+slub_max_order to 0, what cause minimum possible order of slabs
+allocation.
 
 SLUB Debug output
 -----------------
diff --git a/arch/Kconfig b/arch/Kconfig
index 2505740b81d2..4f55c736be11 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -185,4 +185,18 @@ config HAVE_RCU_TABLE_FREE
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
+config HAVE_ALIGNED_STRUCT_PAGE
+	bool
+	help
+	  This makes sure that struct pages are double word aligned and that
+	  e.g. the SLUB allocator can perform double word atomic operations
+	  on a struct page for better performance. However selecting this
+	  might increase the size of a struct page by a word.
+
+config HAVE_CMPXCHG_LOCAL
+	bool
+
+config HAVE_CMPXCHG_DOUBLE
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h
index 9702c2213e1e..62d9ded01635 100644
--- a/arch/avr32/include/asm/system.h
+++ b/arch/avr32/include/asm/system.h
@@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 
 struct pt_regs;
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err);
+void die(const char *str, struct pt_regs *regs, long err);
 void _exception(long signr, struct pt_regs *regs, int code,
 		unsigned long addr);
 
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 7aa25756412f..3d760c06f024 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -24,7 +24,7 @@
 
 static DEFINE_SPINLOCK(die_lock);
 
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
 	static int die_counter;
 
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index d9f397fae03e..691be0b95c1e 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -309,7 +309,6 @@ struct thread_struct {
 }
 
 #define start_thread(regs,new_ip,new_sp) do { \
-	set_fs(USER_DS); \
 	regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL)) \
 			& ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS)); \
 	regs->cr_iip = new_ip; \
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 3d3aeef46947..4eed35814994 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -27,11 +27,11 @@
 #include <asm/sal.h>
 #include <asm/mca.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 					unsigned long indirection_page,
 					unsigned long start_address,
 					struct ia64_boot_param *boot_param,
-					unsigned long pal_addr) ATTRIB_NORET;
+					unsigned long pal_addr) __noreturn;
 
 struct kimage *ia64_kimage;
 
diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c
index 82a4bb51d5d8..b95a451b1c3a 100644
--- a/arch/m68k/amiga/config.c
+++ b/arch/m68k/amiga/config.c
@@ -511,8 +511,7 @@ static unsigned long amiga_gettimeoffset(void)
 	return ticks + offset;
 }
 
-static NORET_TYPE void amiga_reset(void)
-    ATTRIB_NORET;
+static void amiga_reset(void) __noreturn;
 
 static void amiga_reset(void)
 {
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..7b99c670e478 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
 extern asmlinkage void syscall_trace_enter(struct pt_regs *regs);
 extern asmlinkage void syscall_trace_leave(struct pt_regs *regs);
 
-extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET;
+extern void die(const char *, struct pt_regs *) __noreturn;
 
 static inline void die_if_kernel(const char *str, struct pt_regs *regs)
 {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 5c8a49d55054..bbddb86c1fa1 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs)
 /*
  * NMI exception handler.
  */
-NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs)
+void __noreturn nmi_exception_handler(struct pt_regs *regs)
 {
 	bust_spinlocks(1);
 	printk("NMI taken!!!!\n");
diff --git a/arch/mn10300/include/asm/exceptions.h b/arch/mn10300/include/asm/exceptions.h
index ca3e20508c77..95a4d42c3a06 100644
--- a/arch/mn10300/include/asm/exceptions.h
+++ b/arch/mn10300/include/asm/exceptions.h
@@ -110,7 +110,7 @@ extern asmlinkage void nmi_handler(void);
 extern asmlinkage void misalignment(struct pt_regs *, enum exception_code);
 
 extern void die(const char *, struct pt_regs *, enum exception_code)
-	ATTRIB_NORET;
+	__noreturn;
 
 extern int die_if_no_fixup(const char *, struct pt_regs *, enum exception_code);
 
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index 9ce66e9d1c2b..7213ec9e594c 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -196,7 +196,6 @@ typedef unsigned int elf_caddr_t;
 	/* offset pc for priv. level */ \
 	pc |= 3; \
 	\
-	set_fs(USER_DS); \
 	regs->iasq[0] = spaceid; \
 	regs->iasq[1] = spaceid; \
 	regs->iaoq[0] = pc; \
@@ -299,7 +298,6 @@ on downward growing arches, it looks like this:
 	elf_addr_t pc = (elf_addr_t)new_pc | 3; \
 	elf_caddr_t *argv = (elf_caddr_t *)bprm->exec + 1; \
 	\
-	set_fs(USER_DS); \
 	regs->iasq[0] = spaceid; \
 	regs->iasq[1] = spaceid; \
 	regs->iaoq[0] = pc; \
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 4b4b9181a1a0..62c60b87d039 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -192,7 +192,6 @@ void flush_thread(void)
 	/* Only needs to handle fpu stuff or perf monitors.
 	** REVISIT: several arches implement a "lazy fpu state".
 	*/
-	set_fs(USER_DS);
 }
 
 void release_thread(struct task_struct *dead_task)
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index e63f2e7d2efb..affe5dcce7f4 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -16,10 +16,10 @@
 #include <asm/hw_irq.h>
 #include <asm/io.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 				unsigned long indirection_page,
 				unsigned long reboot_code_buffer,
-				unsigned long start_address) ATTRIB_NORET;
+				unsigned long start_address) __noreturn;
 
 /*
  * This is a generic machine_kexec function suitable at least for
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 26ccbf77dd41..d7f609086a99 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data =
 struct paca_struct kexec_paca;
 
 /* Our assembly helper, in kexec_stub.S */
-extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
+extern void kexec_sequence(void *newstack, unsigned long start,
 					void *image, void *control,
-					void (*clear_all)(void)) ATTRIB_NORET;
+					void (*clear_all)(void)) __noreturn;
 
 /* too late to fail here */
 void default_machine_kexec(struct kimage *image)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4ff3d8e411a7..3feefc3842a8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  */
 static void __init setup_node_to_cpumask_map(void)
 {
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 330a57b7c17c..36f957f31842 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		/* These are almost always orderly shutdowns. */
 		return;
 	case KMSG_DUMP_OOPS:
-	case KMSG_DUMP_KEXEC:
 		break;
 	case KMSG_DUMP_PANIC:
 		panicking = true;
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 27272f6a14c2..d25843a6a915 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -236,7 +236,7 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc)
 /*
  * Function to drop a processor into disabled wait state
  */
-static inline void ATTRIB_NORET disabled_wait(unsigned long code)
+static inline void __noreturn disabled_wait(unsigned long code)
 {
 	unsigned long ctl_buf;
 	psw_t dw_psw;
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index fab88431a06f..0fd2e863e114 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -30,7 +30,7 @@ struct mcck_struct {
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static NORET_TYPE void s390_handle_damage(char *msg)
+static void s390_handle_damage(char *msg)
 {
 	smp_send_stop();
 	disabled_wait((unsigned long) __builtin_return_address(0));
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index aaf6d59c2012..7ec665178125 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -70,7 +70,7 @@ void show_regs(struct pt_regs * regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
 	do_exit(fn(arg));
 }
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 210c1cabcb7f..cbd4e4bb9fc5 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -285,7 +285,7 @@ void show_regs(struct pt_regs *regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
 	do_exit(fn(arg));
 }
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index e00d7179989e..6255f2eab112 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -248,11 +248,11 @@ static void setup_quasi_va_is_pa(void)
 }
 
 
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
-	NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long)
-		ATTRIB_NORET;
+	void (*rnk)(unsigned long, void *, unsigned long)
+		__noreturn;
 
 	/* Mask all interrupts before starting to reboot. */
 	interrupt_mask_set_mask(~0ULL);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a150f4c35e94..6c14ecd851d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,9 @@ config X86
 	select PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
+	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+	select HAVE_CMPXCHG_LOCAL if !M386
+	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-	def_bool X86_64 || (X86_32 && !M386)
-
-config CMPXCHG_DOUBLE
-	def_bool y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 020cd2e80873..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,14 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f17e3ea041c0..ed5de58c340f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -295,11 +295,22 @@ static int memory_block_change_state(struct memory_block *mem,
 
 	ret = memory_block_action(mem->start_section_nr, to_state);
 
-	if (ret)
+	if (ret) {
 		mem->state = from_state_req;
-	else
-		mem->state = to_state;
+		goto out;
+	}
 
+	mem->state = to_state;
+	switch (mem->state) {
+	case MEM_OFFLINE:
+		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
+		break;
+	case MEM_ONLINE:
+		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
+		break;
+	default:
+		break;
+	}
 out:
 	mutex_unlock(&mem->state_mutex);
 	return ret;
diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c
index 7c7f42a1f880..9fec3232b736 100644
--- a/drivers/char/ramoops.c
+++ b/drivers/char/ramoops.c
@@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper,
 	struct timeval timestamp;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
@@ -126,8 +125,8 @@ static int __init ramoops_probe(struct platform_device *pdev)
 		goto fail3;
 	}
 
-	rounddown_pow_of_two(pdata->mem_size);
-	rounddown_pow_of_two(pdata->record_size);
+	pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+	pdata->record_size = rounddown_pow_of_two(pdata->record_size);
 
 	/* Check for the minimum memory size */
 	if (pdata->mem_size < MIN_MEM_SIZE &&
@@ -148,14 +147,6 @@ static int __init ramoops_probe(struct platform_device *pdev)
 	cxt->phys_addr = pdata->mem_address;
 	cxt->record_size = pdata->record_size;
 	cxt->dump_oops = pdata->dump_oops;
-	/*
-	 * Update the module parameter variables as well so they are visible
-	 * through /sys/module/ramoops/parameters/
-	 */
-	mem_size = pdata->mem_size;
-	mem_address = pdata->mem_address;
-	record_size = pdata->record_size;
-	dump_oops = pdata->dump_oops;
 
 	if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
 		pr_err("request mem region failed\n");
@@ -176,6 +167,15 @@ static int __init ramoops_probe(struct platform_device *pdev)
 		goto fail1;
 	}
 
+	/*
+	 * Update the module parameter variables as well so they are visible
+	 * through /sys/module/ramoops/parameters/
+	 */
+	mem_size = pdata->mem_size;
+	mem_address = pdata->mem_address;
+	record_size = pdata->record_size;
+	dump_oops = pdata->dump_oops;
+
 	return 0;
 
 fail1:
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index db8e8272d69b..3ce99e00a49e 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper,
 	char *dst;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index d0b597b50398..0cb64f50cecd 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -3404,8 +3404,8 @@ static int __init parport_init_mode_setup(char *str)
 #endif
 
 #ifdef MODULE
-static const char *irq[PARPORT_PC_MAX_PORTS];
-static const char *dma[PARPORT_PC_MAX_PORTS];
+static char *irq[PARPORT_PC_MAX_PORTS];
+static char *dma[PARPORT_PC_MAX_PORTS];
 
 MODULE_PARM_DESC(io, "Base I/O address (SPP regs)");
 module_param_array(io, int, NULL, 0);
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 081dc4745274..fe13ac567d54 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -81,7 +81,7 @@ static int vram __devinitdata = 0;
 static int bpp __devinitdata = 8;
 static int reverse_i2c __devinitdata;
 #ifdef CONFIG_MTRR
-static int nomtrr __devinitdata = 0;
+static bool nomtrr __devinitdata = false;
 #endif
 #ifdef CONFIG_PMAC_BACKLIGHT
 static int backlight __devinitdata = 1;
@@ -1509,7 +1509,7 @@ static int __devinit nvidiafb_setup(char *options)
 			backlight = simple_strtoul(this_opt+10, NULL, 0);
 #ifdef CONFIG_MTRR
 		} else if (!strncmp(this_opt, "nomtrr", 6)) {
-			nomtrr = 1;
+			nomtrr = true;
 #endif
 		} else if (!strncmp(this_opt, "fpdither:", 9)) {
 			fpdither = simple_strtol(this_opt+9, NULL, 0);
@@ -1599,7 +1599,7 @@ MODULE_PARM_DESC(bpp, "pixel width in bits"
 module_param(reverse_i2c, int, 0);
 MODULE_PARM_DESC(reverse_i2c, "reverse port assignment of the i2c bus");
 #ifdef CONFIG_MTRR
-module_param(nomtrr, bool, 0);
+module_param(nomtrr, bool, false);
 MODULE_PARM_DESC(nomtrr, "Disables MTRR support (0 or 1=disabled) "
 		 "(default=0)");
 #endif
diff --git a/fs/block_dev.c b/fs/block_dev.c
index afe74dda632b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
@@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				disk_put_part(bdev->bd_part);
 				bdev->bd_part = NULL;
 				bdev->bd_disk = NULL;
+				bdev->bd_queue = NULL;
 				mutex_unlock(&bdev->bd_mutex);
 				disk_unblock_events(disk);
 				put_disk(disk);
@@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
+	bdev->bd_queue = NULL;
 	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..d8525662ca7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 #ifdef CONFIG_MIGRATION
 static int btree_migratepage(struct address_space *mapping,
-			struct page *newpage, struct page *page)
+			struct page *newpage, struct page *page,
+			enum migrate_mode mode)
 {
 	/*
 	 * we can't safely write a btree page from here,
@@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping,
 	if (page_has_private(page) &&
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
-	return migrate_page(mapping, newpage, page);
+	return migrate_page(mapping, newpage, page, mode);
 }
 #endif
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <linux/atomic.h>
+#include <linux/prefetch.h>
 
 /*
  * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 {
 	int ret;
 	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
+	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
-	unsigned long dio_count;/* Number of dio_block-sized blocks */
-	unsigned long blkmask;
 	int create;
 
 	/*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	if (ret == 0) {
 		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
 		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
-		dio_count = sdio->final_block_in_request - sdio->block_in_file;
-		fs_count = dio_count >> sdio->blkfactor;
-		blkmask = (1 << sdio->blkfactor) - 1;
-		if (dio_count & blkmask)
-			fs_count++;
+		fs_endblk = (sdio->final_block_in_request - 1) >>
+					sdio->blkfactor;
+		fs_count = fs_endblk - fs_startblk + 1;
 
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
  * individual fields and will generate much worse code. This is important
  * for the whole file.
  */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	size_t size;
 	unsigned long addr;
 	unsigned blkbits = inode->i_blkbits;
-	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
 
-	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	/*
+	 * Avoid references to bdev if not absolutely needed to give
+	 * the early prefetch in the caller enough time.
+	 */
 
 	if (offset & blocksize_mask) {
 		if (bdev)
-			blkbits = bdev_blkbits;
+			blkbits = blksize_bits(bdev_logical_block_size(bdev));
 		blocksize_mask = (1 << blkbits) - 1;
 		if (offset & blocksize_mask)
 			goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		addr = (unsigned long)iov[seg].iov_base;
 		size = iov[seg].iov_len;
 		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+		if (unlikely((addr & blocksize_mask) ||
+			     (size & blocksize_mask))) {
 			if (bdev)
-				blkbits = bdev_blkbits;
+				blkbits = blksize_bits(
+					 bdev_logical_block_size(bdev));
 			blocksize_mask = (1 << blkbits) - 1;
 			if ((addr & blocksize_mask) || (size & blocksize_mask))
 				goto out;
 		}
 	}
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io, int flags)
+{
+	/*
+	 * The block device state is needed in the end to finally
+	 * submit everything. Since it's likely to be cache cold
+	 * prefetch it here as first thing to hide some of the
+	 * latency.
+	 *
+	 * Attempt to prefetch the pieces we likely need later.
+	 */
+	prefetch(&bdev->bd_disk->part_tbl);
+	prefetch(bdev->bd_queue);
+	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				     nr_segs, get_block, end_io,
+				     submit_io, flags);
+}
+
 EXPORT_SYMBOL(__blockdev_direct_IO);
 
 static __init int dio_init(void)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
197 197
198 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
199 struct user_struct *user; 199 struct user_struct *user;
200
201 struct file *file;
202
203 /* used to optimize loop detection check */
204 int visited;
205 struct list_head visited_list_link;
200}; 206};
201 207
202/* Wait structure used by the poll hooks */ 208/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
255/* Slab cache used to allocate "struct eppoll_entry" */ 261/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly; 262static struct kmem_cache *pwq_cache __read_mostly;
257 263
264/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
265static LIST_HEAD(visited_list);
266
267/*
268 * List of files with newly added links, where we may need to limit the number
269 * of emanating paths. Protected by the epmutex.
270 */
271static LIST_HEAD(tfile_check_list);
272
258#ifdef CONFIG_SYSCTL 273#ifdef CONFIG_SYSCTL
259 274
260#include <linux/sysctl.h> 275#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
276}; 291};
277#endif /* CONFIG_SYSCTL */ 292#endif /* CONFIG_SYSCTL */
278 293
294static const struct file_operations eventpoll_fops;
295
296static inline int is_file_epoll(struct file *f)
297{
298 return f->f_op == &eventpoll_fops;
299}
279 300
280/* Setup the structure that is used as key for the RB tree */ 301/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd, 302static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
711 .llseek = noop_llseek, 732 .llseek = noop_llseek,
712}; 733};
713 734
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720/* 735/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll 736 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are 737 * interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
926 rb_insert_color(&epi->rbn, &ep->rbr); 941 rb_insert_color(&epi->rbn, &ep->rbr);
927} 942}
928 943
944
945
946#define PATH_ARR_SIZE 5
947/*
948 * These are the number paths of length 1 to 5, that we are allowing to emanate
949 * from a single file of interest. For example, we allow 1000 paths of length
950 * 1, to emanate from each file of interest. This essentially represents the
951 * potential wakeup paths, which need to be limited in order to avoid massive
952 * uncontrolled wakeup storms. The common use case should be a single ep which
953 * is connected to n file sources. In this case each file source has 1 path
954 * of length 1. Thus, the numbers below should be more than sufficient. These
955 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
956 * and delete can't add additional paths. Protected by the epmutex.
957 */
958static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
959static int path_count[PATH_ARR_SIZE];
960
961static int path_count_inc(int nests)
962{
963 if (++path_count[nests] > path_limits[nests])
964 return -1;
965 return 0;
966}
967
968static void path_count_init(void)
969{
970 int i;
971
972 for (i = 0; i < PATH_ARR_SIZE; i++)
973 path_count[i] = 0;
974}
975
976static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
977{
978 int error = 0;
979 struct file *file = priv;
980 struct file *child_file;
981 struct epitem *epi;
982
983 list_for_each_entry(epi, &file->f_ep_links, fllink) {
984 child_file = epi->ep->file;
985 if (is_file_epoll(child_file)) {
986 if (list_empty(&child_file->f_ep_links)) {
987 if (path_count_inc(call_nests)) {
988 error = -1;
989 break;
990 }
991 } else {
992 error = ep_call_nested(&poll_loop_ncalls,
993 EP_MAX_NESTS,
994 reverse_path_check_proc,
995 child_file, child_file,
996 current);
997 }
998 if (error != 0)
999 break;
1000 } else {
1001 printk(KERN_ERR "reverse_path_check_proc: "
1002 "file is not an ep!\n");
1003 }
1004 }
1005 return error;
1006}
1007
1008/**
1009 * reverse_path_check - The tfile_check_list is list of file *, which have
1010 * links that are proposed to be newly added. We need to
1011 * make sure that those added links don't add too many
1012 * paths such that we will spend all our time waking up
1013 * eventpoll objects.
1014 *
1015 * Returns: Returns zero if the proposed links don't create too many paths,
1016 * -1 otherwise.
1017 */
1018static int reverse_path_check(void)
1019{
1020 int length = 0;
1021 int error = 0;
1022 struct file *current_file;
1023
1024 /* let's call this for all tfiles */
1025 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1026 length++;
1027 path_count_init();
1028 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1029 reverse_path_check_proc, current_file,
1030 current_file, current);
1031 if (error)
1032 break;
1033 }
1034 return error;
1035}
1036
929/* 1037/*
930 * Must be called with "mtx" held. 1038 * Must be called with "mtx" held.
931 */ 1039 */
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
987 */ 1095 */
988 ep_rbtree_insert(ep, epi); 1096 ep_rbtree_insert(ep, epi);
989 1097
1098 /* now check if we've created too many backpaths */
1099 error = -EINVAL;
1100 if (reverse_path_check())
1101 goto error_remove_epi;
1102
990 /* We have to drop the new item inside our item list to keep track of it */ 1103 /* We have to drop the new item inside our item list to keep track of it */
991 spin_lock_irqsave(&ep->lock, flags); 1104 spin_lock_irqsave(&ep->lock, flags);
992 1105
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 1124
1012 return 0; 1125 return 0;
1013 1126
1127error_remove_epi:
1128 spin_lock(&tfile->f_lock);
1129 if (ep_is_linked(&epi->fllink))
1130 list_del_init(&epi->fllink);
1131 spin_unlock(&tfile->f_lock);
1132
1133 rb_erase(&epi->rbn, &ep->rbr);
1134
1014error_unregister: 1135error_unregister:
1015 ep_unregister_pollwait(ep, epi); 1136 ep_unregister_pollwait(ep, epi);
1016 1137
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1275 int error = 0; 1396 int error = 0;
1276 struct file *file = priv; 1397 struct file *file = priv;
1277 struct eventpoll *ep = file->private_data; 1398 struct eventpoll *ep = file->private_data;
1399 struct eventpoll *ep_tovisit;
1278 struct rb_node *rbp; 1400 struct rb_node *rbp;
1279 struct epitem *epi; 1401 struct epitem *epi;
1280 1402
1281 mutex_lock_nested(&ep->mtx, call_nests + 1); 1403 mutex_lock_nested(&ep->mtx, call_nests + 1);
1404 ep->visited = 1;
1405 list_add(&ep->visited_list_link, &visited_list);
1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1406 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283 epi = rb_entry(rbp, struct epitem, rbn); 1407 epi = rb_entry(rbp, struct epitem, rbn);
1284 if (unlikely(is_file_epoll(epi->ffd.file))) { 1408 if (unlikely(is_file_epoll(epi->ffd.file))) {
1409 ep_tovisit = epi->ffd.file->private_data;
1410 if (ep_tovisit->visited)
1411 continue;
1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1412 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286 ep_loop_check_proc, epi->ffd.file, 1413 ep_loop_check_proc, epi->ffd.file,
1287 epi->ffd.file->private_data, current); 1414 ep_tovisit, current);
1288 if (error != 0) 1415 if (error != 0)
1289 break; 1416 break;
1417 } else {
1418 /*
1419 * If we've reached a file that is not associated with
1420 * an ep, then we need to check if the newly added
1421 * links are going to add too many wakeup paths. We do
1422 * this by adding it to the tfile_check_list, if it's
1423 * not already there, and calling reverse_path_check()
1424 * during ep_insert().
1425 */
1426 if (list_empty(&epi->ffd.file->f_tfile_llink))
1427 list_add(&epi->ffd.file->f_tfile_llink,
1428 &tfile_check_list);
1290 } 1429 }
1291 } 1430 }
1292 mutex_unlock(&ep->mtx); 1431 mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1307 */ 1446 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file) 1447static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{ 1448{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1449 int ret;
1450 struct eventpoll *ep_cur, *ep_next;
1451
1452 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current); 1453 ep_loop_check_proc, file, ep, current);
1454 /* clear visited list */
1455 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1456 visited_list_link) {
1457 ep_cur->visited = 0;
1458 list_del(&ep_cur->visited_list_link);
1459 }
1460 return ret;
1461}
1462
1463static void clear_tfile_check_list(void)
1464{
1465 struct file *file;
1466
1467 /* first clear the tfile_check_list */
1468 while (!list_empty(&tfile_check_list)) {
1469 file = list_first_entry(&tfile_check_list, struct file,
1470 f_tfile_llink);
1471 list_del_init(&file->f_tfile_llink);
1472 }
1473 INIT_LIST_HEAD(&tfile_check_list);
1312} 1474}
1313 1475
1314/* 1476/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
1316 */ 1478 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags) 1479SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{ 1480{
1319 int error; 1481 int error, fd;
1320 struct eventpoll *ep = NULL; 1482 struct eventpoll *ep = NULL;
1483 struct file *file;
1321 1484
1322 /* Check the EPOLL_* constant for consistency. */ 1485 /* Check the EPOLL_* constant for consistency. */
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1486 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1334 * Creates all the items needed to setup an eventpoll file. That is, 1497 * Creates all the items needed to setup an eventpoll file. That is,
1335 * a file structure and a free file descriptor. 1498 * a file structure and a free file descriptor.
1336 */ 1499 */
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1500 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1501 if (fd < 0) {
1502 error = fd;
1503 goto out_free_ep;
1504 }
1505 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC)); 1506 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0) 1507 if (IS_ERR(file)) {
1340 ep_free(ep); 1508 error = PTR_ERR(file);
1341 1509 goto out_free_fd;
1510 }
1511 fd_install(fd, file);
1512 ep->file = file;
1513 return fd;
1514
1515out_free_fd:
1516 put_unused_fd(fd);
1517out_free_ep:
1518 ep_free(ep);
1342 return error; 1519 return error;
1343} 1520}
1344 1521
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1404 /* 1581 /*
1405 * When we insert an epoll file descriptor, inside another epoll file 1582 * When we insert an epoll file descriptor, inside another epoll file
1406 * descriptor, there is the chance of creating closed loops, which are 1583 * descriptor, there is the chance of creating closed loops, which are
1407 * better handled here than in more critical paths. 1584 * better handled here than in more critical paths. While we are
1585 * checking for loops we also determine the list of files reachable
1586 * and hang them on the tfile_check_list, so we can check that we
1587 * haven't created too many possible wakeup paths.
1408 * 1588 *
1409 * We hold epmutex across the loop check and the insert in this case, in 1589 * We need to hold the epmutex across both ep_insert and ep_remove
1410 * order to prevent two separate inserts from racing and each doing the 1590 * b/c we want to make sure we are looking at a coherent view of
1411 * insert "at the same time" such that ep_loop_check passes on both 1591 * epoll network.
1412 * before either one does the insert, thereby creating a cycle.
1413 */ 1592 */
1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { 1593 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1415 mutex_lock(&epmutex); 1594 mutex_lock(&epmutex);
1416 did_lock_epmutex = 1; 1595 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1420 } 1596 }
1421 1597 if (op == EPOLL_CTL_ADD) {
1598 if (is_file_epoll(tfile)) {
1599 error = -ELOOP;
1600 if (ep_loop_check(ep, tfile) != 0)
1601 goto error_tgt_fput;
1602 } else
1603 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1604 }
1422 1605
1423 mutex_lock_nested(&ep->mtx, 0); 1606 mutex_lock_nested(&ep->mtx, 0);
1424 1607
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1437 error = ep_insert(ep, &epds, tfile, fd); 1620 error = ep_insert(ep, &epds, tfile, fd);
1438 } else 1621 } else
1439 error = -EEXIST; 1622 error = -EEXIST;
1623 clear_tfile_check_list();
1440 break; 1624 break;
1441 case EPOLL_CTL_DEL: 1625 case EPOLL_CTL_DEL:
1442 if (epi) 1626 if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1455 mutex_unlock(&ep->mtx); 1639 mutex_unlock(&ep->mtx);
1456 1640
1457error_tgt_fput: 1641error_tgt_fput:
1458 if (unlikely(did_lock_epmutex)) 1642 if (did_lock_epmutex)
1459 mutex_unlock(&epmutex); 1643 mutex_unlock(&epmutex);
1460 1644
1461 fput(tfile); 1645 fput(tfile);
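
The eventpoll changes above are userspace-visible in two ways: EPOLL_CTL_ADD of one epoll descriptor into another still fails with ELOOP when it would close a cycle, and the files reachable through the add are collected on tfile_check_list so reverse_path_check() can refuse configurations with too many wakeup paths. A minimal userspace sketch of the ELOOP case (illustration only, not part of the patch):

/* Nesting two epoll instances is fine; closing the loop is rejected. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int ep1 = epoll_create1(0);
        int ep2 = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };

        if (ep1 < 0 || ep2 < 0)
                return 1;

        /* ep1 watches ep2: allowed */
        ev.data.fd = ep2;
        if (epoll_ctl(ep1, EPOLL_CTL_ADD, ep2, &ev))
                perror("add ep2 to ep1");

        /* ep2 watches ep1: would create a cycle, fails with ELOOP */
        ev.data.fd = ep1;
        if (epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &ev))
                fprintf(stderr, "add ep1 to ep2: %s\n", strerror(errno));

        close(ep1);
        close(ep2);
        return 0;
}
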
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
583} 583}
584 584
585static int hugetlbfs_migrate_page(struct address_space *mapping, 585static int hugetlbfs_migrate_page(struct address_space *mapping,
586 struct page *newpage, struct page *page) 586 struct page *newpage, struct page *page,
587 enum migrate_mode mode)
587{ 588{
588 int rc; 589 int rc;
589 590
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ee92538b063..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
332 332
333#ifdef CONFIG_MIGRATION 333#ifdef CONFIG_MIGRATION
334extern int nfs_migrate_page(struct address_space *, 334extern int nfs_migrate_page(struct address_space *,
335 struct page *, struct page *); 335 struct page *, struct page *, enum migrate_mode);
336#else 336#else
337#define nfs_migrate_page NULL 337#define nfs_migrate_page NULL
338#endif 338#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0c3885255f97..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1688,7 +1688,7 @@ out_error:
1688 1688
1689#ifdef CONFIG_MIGRATION 1689#ifdef CONFIG_MIGRATION
1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1691 struct page *page) 1691 struct page *page, enum migrate_mode mode)
1692{ 1692{
1693 /* 1693 /*
1694 * If PagePrivate is set, then the page is currently associated with 1694 * If PagePrivate is set, then the page is currently associated with
@@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1703 1703
1704 nfs_fscache_release_page(page, GFP_KERNEL); 1704 nfs_fscache_release_page(page, GFP_KERNEL);
1705 1705
1706 return migrate_page(mapping, newpage, page); 1706 return migrate_page(mapping, newpage, page, mode);
1707} 1707}
1708#endif 1708#endif
1709 1709
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1137 if (nr_pages < pipe->nrbufs) 1137 if (nr_pages < pipe->nrbufs)
1138 return -EBUSY; 1138 return -EBUSY;
1139 1139
1140 bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); 1140 bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
1141 if (unlikely(!bufs)) 1141 if (unlikely(!bufs))
1142 return -ENOMEM; 1142 return -ENOMEM;
1143 1143
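
The pipe change only adds __GFP_NOWARN to the buffer allocation, so an oversized F_SETPIPE_SZ request from userspace now fails quietly with ENOMEM (or EPERM for unprivileged callers over the limit) instead of triggering a page-allocation warning. A small sketch of the path that reaches pipe_set_size() (illustration only):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];

        if (pipe(fds))
                return 1;
        if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 30) < 0) /* absurdly large */
                perror("F_SETPIPE_SZ");
        printf("pipe size now %d\n", fcntl(fds[0], F_GETPIPE_SZ));
        return 0;
}
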
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..9252ee3b71e3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
464 464
465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ 465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", 467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
468 pid_nr_ns(pid, ns), 468 pid_nr_ns(pid, ns),
469 tcomm, 469 tcomm,
470 state, 470 state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
511 task->policy, 511 task->policy,
512 (unsigned long long)delayacct_blkio_ticks(task), 512 (unsigned long long)delayacct_blkio_ticks(task),
513 cputime_to_clock_t(gtime), 513 cputime_to_clock_t(gtime),
514 cputime_to_clock_t(cgtime)); 514 cputime_to_clock_t(cgtime),
515 (mm && permitted) ? mm->start_data : 0,
516 (mm && permitted) ? mm->end_data : 0,
517 (mm && permitted) ? mm->start_brk : 0);
515 if (mm) 518 if (mm)
516 mmput(mm); 519 mmput(mm);
517 return 0; 520 return 0;
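
With the do_task_stat() change, three values (start_data, end_data, start_brk) are appended to /proc/$pid/stat and are reported as 0 unless the reader is permitted to see the target's mm. As of this patch they are the last three fields on the line, so a reader can simply take the tail; a rough sketch (not from the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* strip and return the last space-separated number on the line */
static unsigned long pop_last(char *buf)
{
        char *p = strrchr(buf, ' ');
        unsigned long v = p ? strtoul(p + 1, NULL, 10) : 0;

        if (p)
                *p = '\0';
        return v;
}

int main(void)
{
        char buf[4096];
        unsigned long start_data, end_data, start_brk;
        FILE *f = fopen("/proc/self/stat", "r");

        if (!f || !fgets(buf, sizeof(buf), f))
                return 1;
        fclose(f);

        buf[strcspn(buf, "\n")] = '\0';
        start_brk  = pop_last(buf);
        end_data   = pop_last(buf);
        start_data = pop_last(buf);

        printf("start_data=%lx end_data=%lx start_brk=%lx\n",
               start_data, end_data, start_brk);
        return 0;
}
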
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb2..5485a5388ecb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask)
654 bool has_perms; 654 bool has_perms;
655 655
656 task = get_proc_task(inode); 656 task = get_proc_task(inode);
657 if (!task)
658 return -ESRCH;
657 has_perms = has_pid_permissions(pid, task, 1); 659 has_perms = has_pid_permissions(pid, task, 1);
658 put_task_struct(task); 660 put_task_struct(task);
659 661
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e58fa777fa09..f96a5b58a975 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -139,6 +139,20 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
139 __tlb_remove_tlb_entry(tlb, ptep, address); \ 139 __tlb_remove_tlb_entry(tlb, ptep, address); \
140 } while (0) 140 } while (0)
141 141
142/**
143 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
144 * This is a nop so far, because only x86 needs it.
145 */
146#ifndef __tlb_remove_pmd_tlb_entry
147#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
148#endif
149
150#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \
151 do { \
152 tlb->need_flush = 1; \
153 __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
154 } while (0)
155
142#define pte_free_tlb(tlb, ptep, address) \ 156#define pte_free_tlb(tlb, ptep, address) \
143 do { \ 157 do { \
144 tlb->need_flush = 1; \ 158 tlb->need_flush = 1; \
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 5c4abce94ad1..b936763f2236 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -5,6 +5,7 @@
5#include <linux/kexec.h> 5#include <linux/kexec.h>
6#include <linux/device.h> 6#include <linux/device.h>
7#include <linux/proc_fs.h> 7#include <linux/proc_fs.h>
8#include <linux/elf.h>
8 9
9#define ELFCORE_ADDR_MAX (-1ULL) 10#define ELFCORE_ADDR_MAX (-1ULL)
10#define ELFCORE_ADDR_ERR (-2ULL) 11#define ELFCORE_ADDR_ERR (-2ULL)
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f362733186a5..657ab55beda0 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -61,6 +61,7 @@ struct file;
61static inline void eventpoll_init_file(struct file *file) 61static inline void eventpoll_init_file(struct file *file)
62{ 62{
63 INIT_LIST_HEAD(&file->f_ep_links); 63 INIT_LIST_HEAD(&file->f_ep_links);
64 INIT_LIST_HEAD(&file->f_tfile_llink);
64} 65}
65 66
66 67
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7aacf31418fe..4bc8169fb5a1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -525,6 +525,7 @@ enum positive_aop_returns {
525struct page; 525struct page;
526struct address_space; 526struct address_space;
527struct writeback_control; 527struct writeback_control;
528enum migrate_mode;
528 529
529struct iov_iter { 530struct iov_iter {
530 const struct iovec *iov; 531 const struct iovec *iov;
@@ -609,9 +610,12 @@ struct address_space_operations {
609 loff_t offset, unsigned long nr_segs); 610 loff_t offset, unsigned long nr_segs);
610 int (*get_xip_mem)(struct address_space *, pgoff_t, int, 611 int (*get_xip_mem)(struct address_space *, pgoff_t, int,
611 void **, unsigned long *); 612 void **, unsigned long *);
612 /* migrate the contents of a page to the specified target */ 613 /*
614 * migrate the contents of a page to the specified target. If sync
615 * is false, it must not block.
616 */
613 int (*migratepage) (struct address_space *, 617 int (*migratepage) (struct address_space *,
614 struct page *, struct page *); 618 struct page *, struct page *, enum migrate_mode);
615 int (*launder_page) (struct page *); 619 int (*launder_page) (struct page *);
616 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 620 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
617 unsigned long); 621 unsigned long);
@@ -656,6 +660,7 @@ struct address_space {
656 * must be enforced here for CRIS, to let the least significant bit 660 * must be enforced here for CRIS, to let the least significant bit
657 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 661 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
658 */ 662 */
663struct request_queue;
659 664
660struct block_device { 665struct block_device {
661 dev_t bd_dev; /* not a kdev_t - it's a search key */ 666 dev_t bd_dev; /* not a kdev_t - it's a search key */
@@ -678,6 +683,7 @@ struct block_device {
678 unsigned bd_part_count; 683 unsigned bd_part_count;
679 int bd_invalidated; 684 int bd_invalidated;
680 struct gendisk * bd_disk; 685 struct gendisk * bd_disk;
686 struct request_queue * bd_queue;
681 struct list_head bd_list; 687 struct list_head bd_list;
682 /* 688 /*
683 * Private data. You must have bd_claim'ed the block_device 689 * Private data. You must have bd_claim'ed the block_device
@@ -1001,6 +1007,7 @@ struct file {
1001#ifdef CONFIG_EPOLL 1007#ifdef CONFIG_EPOLL
1002 /* Used by fs/eventpoll.c to link all the hooks to this file */ 1008 /* Used by fs/eventpoll.c to link all the hooks to this file */
1003 struct list_head f_ep_links; 1009 struct list_head f_ep_links;
1010 struct list_head f_tfile_llink;
1004#endif /* #ifdef CONFIG_EPOLL */ 1011#endif /* #ifdef CONFIG_EPOLL */
1005 struct address_space *f_mapping; 1012 struct address_space *f_mapping;
1006#ifdef CONFIG_DEBUG_WRITECOUNT 1013#ifdef CONFIG_DEBUG_WRITECOUNT
@@ -2536,7 +2543,8 @@ extern int generic_check_addressable(unsigned, u64);
2536 2543
2537#ifdef CONFIG_MIGRATION 2544#ifdef CONFIG_MIGRATION
2538extern int buffer_migrate_page(struct address_space *, 2545extern int buffer_migrate_page(struct address_space *,
2539 struct page *, struct page *); 2546 struct page *, struct page *,
2547 enum migrate_mode);
2540#else 2548#else
2541#define buffer_migrate_page NULL 2549#define buffer_migrate_page NULL
2542#endif 2550#endif
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a9ace9c32507..1b921299abc4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
18 unsigned int flags); 18 unsigned int flags);
19extern int zap_huge_pmd(struct mmu_gather *tlb, 19extern int zap_huge_pmd(struct mmu_gather *tlb,
20 struct vm_area_struct *vma, 20 struct vm_area_struct *vma,
21 pmd_t *pmd); 21 pmd_t *pmd, unsigned long addr);
22extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 22extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
23 unsigned long addr, unsigned long end, 23 unsigned long addr, unsigned long end,
24 unsigned char *vec); 24 unsigned char *vec);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d0a7a0c71661..e8343422240a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -185,16 +185,17 @@ static inline void might_fault(void)
185 185
186extern struct atomic_notifier_head panic_notifier_list; 186extern struct atomic_notifier_head panic_notifier_list;
187extern long (*panic_blink)(int state); 187extern long (*panic_blink)(int state);
188NORET_TYPE void panic(const char * fmt, ...) 188__printf(1, 2)
189 __attribute__ ((NORET_AND format (printf, 1, 2))) __cold; 189void panic(const char *fmt, ...)
190 __noreturn __cold;
190extern void oops_enter(void); 191extern void oops_enter(void);
191extern void oops_exit(void); 192extern void oops_exit(void);
192void print_oops_end_marker(void); 193void print_oops_end_marker(void);
193extern int oops_may_print(void); 194extern int oops_may_print(void);
194NORET_TYPE void do_exit(long error_code) 195void do_exit(long error_code)
195 ATTRIB_NORET; 196 __noreturn;
196NORET_TYPE void complete_and_exit(struct completion *, long) 197void complete_and_exit(struct completion *, long)
197 ATTRIB_NORET; 198 __noreturn;
198 199
199/* Internal, do not use. */ 200/* Internal, do not use. */
200int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); 201int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index ee0c952188de..fee66317e071 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -18,7 +18,6 @@
18enum kmsg_dump_reason { 18enum kmsg_dump_reason {
19 KMSG_DUMP_OOPS, 19 KMSG_DUMP_OOPS,
20 KMSG_DUMP_PANIC, 20 KMSG_DUMP_PANIC,
21 KMSG_DUMP_KEXEC,
22 KMSG_DUMP_RESTART, 21 KMSG_DUMP_RESTART,
23 KMSG_DUMP_HALT, 22 KMSG_DUMP_HALT,
24 KMSG_DUMP_POWEROFF, 23 KMSG_DUMP_POWEROFF,
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 3f46aedea42f..807f1e533226 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -88,8 +88,4 @@
88 88
89#endif 89#endif
90 90
91#define NORET_TYPE /**/
92#define ATTRIB_NORET __attribute__((noreturn))
93#define NORET_AND noreturn,
94
95#endif 91#endif
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f944591765eb..4d34356fe644 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,13 +32,11 @@ enum mem_cgroup_page_stat_item {
32 MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ 32 MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
33}; 33};
34 34
35extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 35struct mem_cgroup_reclaim_cookie {
36 struct list_head *dst, 36 struct zone *zone;
37 unsigned long *scanned, int order, 37 int priority;
38 isolate_mode_t mode, 38 unsigned int generation;
39 struct zone *z, 39};
40 struct mem_cgroup *mem_cont,
41 int active, int file);
42 40
43#ifdef CONFIG_CGROUP_MEM_RES_CTLR 41#ifdef CONFIG_CGROUP_MEM_RES_CTLR
44/* 42/*
@@ -56,20 +54,21 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
56 gfp_t gfp_mask); 54 gfp_t gfp_mask);
57/* for swap handling */ 55/* for swap handling */
58extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 56extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
59 struct page *page, gfp_t mask, struct mem_cgroup **ptr); 57 struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
60extern void mem_cgroup_commit_charge_swapin(struct page *page, 58extern void mem_cgroup_commit_charge_swapin(struct page *page,
61 struct mem_cgroup *ptr); 59 struct mem_cgroup *memcg);
62extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); 60extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
63 61
64extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 62extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
65 gfp_t gfp_mask); 63 gfp_t gfp_mask);
66extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); 64
67extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); 65struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
68extern void mem_cgroup_rotate_reclaimable_page(struct page *page); 66struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *,
69extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); 67 enum lru_list);
70extern void mem_cgroup_del_lru(struct page *page); 68void mem_cgroup_lru_del_list(struct page *, enum lru_list);
71extern void mem_cgroup_move_lists(struct page *page, 69void mem_cgroup_lru_del(struct page *);
72 enum lru_list from, enum lru_list to); 70struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *,
71 enum lru_list, enum lru_list);
73 72
74/* For coalescing uncharge for reducing memcg' overhead*/ 73/* For coalescing uncharge for reducing memcg' overhead*/
75extern void mem_cgroup_uncharge_start(void); 74extern void mem_cgroup_uncharge_start(void);
@@ -102,10 +101,15 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
102 101
103extern int 102extern int
104mem_cgroup_prepare_migration(struct page *page, 103mem_cgroup_prepare_migration(struct page *page,
105 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); 104 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
106extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 105extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
107 struct page *oldpage, struct page *newpage, bool migration_ok); 106 struct page *oldpage, struct page *newpage, bool migration_ok);
108 107
108struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
109 struct mem_cgroup *,
110 struct mem_cgroup_reclaim_cookie *);
111void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
112
109/* 113/*
110 * For memory reclaim. 114 * For memory reclaim.
111 */ 115 */
@@ -122,7 +126,10 @@ struct zone_reclaim_stat*
122mem_cgroup_get_reclaim_stat_from_page(struct page *page); 126mem_cgroup_get_reclaim_stat_from_page(struct page *page);
123extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 127extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
124 struct task_struct *p); 128 struct task_struct *p);
129extern void mem_cgroup_replace_page_cache(struct page *oldpage,
130 struct page *newpage);
125 131
132extern void mem_cgroup_reset_owner(struct page *page);
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 133#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
127extern int do_swap_account; 134extern int do_swap_account;
128#endif 135#endif
@@ -157,7 +164,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
157 164
158void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 165void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
159#ifdef CONFIG_TRANSPARENT_HUGEPAGE 166#ifdef CONFIG_TRANSPARENT_HUGEPAGE
160void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); 167void mem_cgroup_split_huge_fixup(struct page *head);
161#endif 168#endif
162 169
163#ifdef CONFIG_DEBUG_VM 170#ifdef CONFIG_DEBUG_VM
@@ -180,17 +187,17 @@ static inline int mem_cgroup_cache_charge(struct page *page,
180} 187}
181 188
182static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 189static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
183 struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) 190 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
184{ 191{
185 return 0; 192 return 0;
186} 193}
187 194
188static inline void mem_cgroup_commit_charge_swapin(struct page *page, 195static inline void mem_cgroup_commit_charge_swapin(struct page *page,
189 struct mem_cgroup *ptr) 196 struct mem_cgroup *memcg)
190{ 197{
191} 198}
192 199
193static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) 200static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
194{ 201{
195} 202}
196 203
@@ -210,33 +217,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page)
210{ 217{
211} 218}
212 219
213static inline void mem_cgroup_add_lru_list(struct page *page, int lru) 220static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
214{ 221 struct mem_cgroup *memcg)
215}
216
217static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
218{ 222{
219 return ; 223 return &zone->lruvec;
220} 224}
221 225
222static inline void mem_cgroup_rotate_reclaimable_page(struct page *page) 226static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone,
227 struct page *page,
228 enum lru_list lru)
223{ 229{
224 return ; 230 return &zone->lruvec;
225} 231}
226 232
227static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) 233static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
228{ 234{
229 return ;
230} 235}
231 236
232static inline void mem_cgroup_del_lru(struct page *page) 237static inline void mem_cgroup_lru_del(struct page *page)
233{ 238{
234 return ;
235} 239}
236 240
237static inline void 241static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
238mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) 242 struct page *page,
243 enum lru_list from,
244 enum lru_list to)
239{ 245{
246 return &zone->lruvec;
240} 247}
241 248
242static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 249static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -269,7 +276,7 @@ static inline struct cgroup_subsys_state
269 276
270static inline int 277static inline int
271mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 278mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
272 struct mem_cgroup **ptr, gfp_t gfp_mask) 279 struct mem_cgroup **memcgp, gfp_t gfp_mask)
273{ 280{
274 return 0; 281 return 0;
275} 282}
@@ -279,6 +286,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
279{ 286{
280} 287}
281 288
289static inline struct mem_cgroup *
290mem_cgroup_iter(struct mem_cgroup *root,
291 struct mem_cgroup *prev,
292 struct mem_cgroup_reclaim_cookie *reclaim)
293{
294 return NULL;
295}
296
297static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
298 struct mem_cgroup *prev)
299{
300}
301
282static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) 302static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
283{ 303{
284 return 0; 304 return 0;
@@ -360,8 +380,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
360 return 0; 380 return 0;
361} 381}
362 382
363static inline void mem_cgroup_split_huge_fixup(struct page *head, 383static inline void mem_cgroup_split_huge_fixup(struct page *head)
364 struct page *tail)
365{ 384{
366} 385}
367 386
@@ -369,6 +388,14 @@ static inline
369void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 388void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
370{ 389{
371} 390}
391static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
392 struct page *newpage)
393{
394}
395
396static inline void mem_cgroup_reset_owner(struct page *page)
397{
398}
372#endif /* CONFIG_CGROUP_MEM_CONT */ 399#endif /* CONFIG_CGROUP_MEM_CONT */
373 400
374#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) 401#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
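
The memcontrol.h hunk replaces the per-memcg LRU helpers with lruvec-based ones and introduces mem_cgroup_iter()/mem_cgroup_iter_break() plus a reclaim cookie for resumable hierarchy walks. A kernel-internal sketch of the intended calling pattern, using only the signatures declared above (the scan body is elided and example_walk_hierarchy is made up for illustration):

#include <linux/memcontrol.h>
#include <linux/mmzone.h>

static void example_walk_hierarchy(struct mem_cgroup *root,
                                   struct zone *zone, int priority)
{
        struct mem_cgroup_reclaim_cookie reclaim = {
                .zone = zone,
                .priority = priority,
        };
        struct mem_cgroup *memcg;

        for (memcg = mem_cgroup_iter(root, NULL, &reclaim);
             memcg;
             memcg = mem_cgroup_iter(root, memcg, &reclaim)) {
                struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
                bool done = false;

                /* ... scan lruvec->lists[] for this memcg/zone and set
                 *     done once enough pages have been reclaimed ... */

                if (done) {
                        /* drop the reference the iterator still holds */
                        mem_cgroup_iter_break(root, memcg);
                        break;
                }
        }
}
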
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e39aeecfe9a2..eaf867412f7a 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -6,18 +6,31 @@
6 6
7typedef struct page *new_page_t(struct page *, unsigned long private, int **); 7typedef struct page *new_page_t(struct page *, unsigned long private, int **);
8 8
9/*
10 * MIGRATE_ASYNC means never block
11 * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking
12 * on most operations but not ->writepage as the potential stall time
13 * is too significant
14 * MIGRATE_SYNC will block when migrating pages
15 */
16enum migrate_mode {
17 MIGRATE_ASYNC,
18 MIGRATE_SYNC_LIGHT,
19 MIGRATE_SYNC,
20};
21
9#ifdef CONFIG_MIGRATION 22#ifdef CONFIG_MIGRATION
10#define PAGE_MIGRATION 1 23#define PAGE_MIGRATION 1
11 24
12extern void putback_lru_pages(struct list_head *l); 25extern void putback_lru_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 26extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 27 struct page *, struct page *, enum migrate_mode);
15extern int migrate_pages(struct list_head *l, new_page_t x, 28extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, bool offlining, 29 unsigned long private, bool offlining,
17 bool sync); 30 enum migrate_mode mode);
18extern int migrate_huge_pages(struct list_head *l, new_page_t x, 31extern int migrate_huge_pages(struct list_head *l, new_page_t x,
19 unsigned long private, bool offlining, 32 unsigned long private, bool offlining,
20 bool sync); 33 enum migrate_mode mode);
21 34
22extern int fail_migrate_page(struct address_space *, 35extern int fail_migrate_page(struct address_space *,
23 struct page *, struct page *); 36 struct page *, struct page *);
@@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
36static inline void putback_lru_pages(struct list_head *l) {} 49static inline void putback_lru_pages(struct list_head *l) {}
37static inline int migrate_pages(struct list_head *l, new_page_t x, 50static inline int migrate_pages(struct list_head *l, new_page_t x,
38 unsigned long private, bool offlining, 51 unsigned long private, bool offlining,
39 bool sync) { return -ENOSYS; } 52 enum migrate_mode mode) { return -ENOSYS; }
40static inline int migrate_huge_pages(struct list_head *l, new_page_t x, 53static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
41 unsigned long private, bool offlining, 54 unsigned long private, bool offlining,
42 bool sync) { return -ENOSYS; } 55 enum migrate_mode mode) { return -ENOSYS; }
43 56
44static inline int migrate_prep(void) { return -ENOSYS; } 57static inline int migrate_prep(void) { return -ENOSYS; }
45static inline int migrate_prep_local(void) { return -ENOSYS; } 58static inline int migrate_prep_local(void) { return -ENOSYS; }
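
enum migrate_mode replaces the old bool sync argument: MIGRATE_ASYNC must never block, MIGRATE_SYNC_LIGHT may block on most things but not ->writepage, and MIGRATE_SYNC may block fully. A hypothetical ->migratepage implementation showing how a filesystem is expected to honour the mode (example_migrate_page is invented for illustration; compare the nfs_migrate_page hunk above):

#include <linux/fs.h>
#include <linux/migrate.h>
#include <linux/mm.h>

static int example_migrate_page(struct address_space *mapping,
                                struct page *newpage, struct page *page,
                                enum migrate_mode mode)
{
        /* Tearing down private state may require blocking, which is only
         * allowed in the synchronous modes; async callers just retry later. */
        if (PagePrivate(page)) {
                if (mode == MIGRATE_ASYNC)
                        return -EAGAIN;
                if (!try_to_release_page(page, GFP_KERNEL))
                        return -EAGAIN;
        }
        return migrate_page(mapping, newpage, page, mode);
}
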
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8f7d24712dc1..227fd3e9a9c9 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,26 +22,21 @@ static inline int page_is_file_cache(struct page *page)
22} 22}
23 23
24static inline void 24static inline void
25__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, 25add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru)
26 struct list_head *head)
27{ 26{
28 list_add(&page->lru, head); 27 struct lruvec *lruvec;
29 __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
30 mem_cgroup_add_lru_list(page, l);
31}
32 28
33static inline void 29 lruvec = mem_cgroup_lru_add_list(zone, page, lru);
34add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) 30 list_add(&page->lru, &lruvec->lists[lru]);
35{ 31 __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page));
36 __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
37} 32}
38 33
39static inline void 34static inline void
40del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) 35del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru)
41{ 36{
37 mem_cgroup_lru_del_list(page, lru);
42 list_del(&page->lru); 38 list_del(&page->lru);
43 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); 39 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page));
44 mem_cgroup_del_lru_list(page, l);
45} 40}
46 41
47/** 42/**
@@ -59,24 +54,28 @@ static inline enum lru_list page_lru_base_type(struct page *page)
59 return LRU_INACTIVE_ANON; 54 return LRU_INACTIVE_ANON;
60} 55}
61 56
62static inline void 57/**
63del_page_from_lru(struct zone *zone, struct page *page) 58 * page_off_lru - which LRU list was page on? clearing its lru flags.
59 * @page: the page to test
60 *
61 * Returns the LRU list a page was on, as an index into the array of LRU
62 * lists; and clears its Unevictable or Active flags, ready for freeing.
63 */
64static inline enum lru_list page_off_lru(struct page *page)
64{ 65{
65 enum lru_list l; 66 enum lru_list lru;
66 67
67 list_del(&page->lru);
68 if (PageUnevictable(page)) { 68 if (PageUnevictable(page)) {
69 __ClearPageUnevictable(page); 69 __ClearPageUnevictable(page);
70 l = LRU_UNEVICTABLE; 70 lru = LRU_UNEVICTABLE;
71 } else { 71 } else {
72 l = page_lru_base_type(page); 72 lru = page_lru_base_type(page);
73 if (PageActive(page)) { 73 if (PageActive(page)) {
74 __ClearPageActive(page); 74 __ClearPageActive(page);
75 l += LRU_ACTIVE; 75 lru += LRU_ACTIVE;
76 } 76 }
77 } 77 }
78 __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); 78 return lru;
79 mem_cgroup_del_lru_list(page, l);
80} 79}
81 80
82/** 81/**
@@ -97,7 +96,6 @@ static inline enum lru_list page_lru(struct page *page)
97 if (PageActive(page)) 96 if (PageActive(page))
98 lru += LRU_ACTIVE; 97 lru += LRU_ACTIVE;
99 } 98 }
100
101 return lru; 99 return lru;
102} 100}
103 101
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5b42f1b34eb7..3cc3062b3767 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -151,12 +151,11 @@ struct page {
151#endif 151#endif
152} 152}
153/* 153/*
154 * If another subsystem starts using the double word pairing for atomic 154 * The struct page can be forced to be double word aligned so that atomic ops
155 * operations on struct page then it must change the #if to ensure 155 * on double words work. The SLUB allocator can make use of such a feature.
156 * proper alignment of the page struct.
157 */ 156 */
158#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) 157#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
159 __attribute__((__aligned__(2*sizeof(unsigned long)))) 158 __aligned(2 * sizeof(unsigned long))
160#endif 159#endif
161; 160;
162 161
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ca6ca92418a6..650ba2fb3301 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -140,25 +140,29 @@ enum lru_list {
140 NR_LRU_LISTS 140 NR_LRU_LISTS
141}; 141};
142 142
143#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) 143#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
144 144
145#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) 145#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
146 146
147static inline int is_file_lru(enum lru_list l) 147static inline int is_file_lru(enum lru_list lru)
148{ 148{
149 return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); 149 return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
150} 150}
151 151
152static inline int is_active_lru(enum lru_list l) 152static inline int is_active_lru(enum lru_list lru)
153{ 153{
154 return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); 154 return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
155} 155}
156 156
157static inline int is_unevictable_lru(enum lru_list l) 157static inline int is_unevictable_lru(enum lru_list lru)
158{ 158{
159 return (l == LRU_UNEVICTABLE); 159 return (lru == LRU_UNEVICTABLE);
160} 160}
161 161
162struct lruvec {
163 struct list_head lists[NR_LRU_LISTS];
164};
165
162/* Mask used at gathering information at once (see memcontrol.c) */ 166/* Mask used at gathering information at once (see memcontrol.c) */
163#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 167#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
164#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 168#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
@@ -173,6 +177,8 @@ static inline int is_unevictable_lru(enum lru_list l)
173#define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) 177#define ISOLATE_CLEAN ((__force isolate_mode_t)0x4)
174/* Isolate unmapped file */ 178/* Isolate unmapped file */
175#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) 179#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8)
180/* Isolate for asynchronous migration */
181#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10)
176 182
177/* LRU Isolation modes. */ 183/* LRU Isolation modes. */
178typedef unsigned __bitwise__ isolate_mode_t; 184typedef unsigned __bitwise__ isolate_mode_t;
@@ -364,10 +370,8 @@ struct zone {
364 ZONE_PADDING(_pad1_) 370 ZONE_PADDING(_pad1_)
365 371
366 /* Fields commonly accessed by the page reclaim scanner */ 372 /* Fields commonly accessed by the page reclaim scanner */
367 spinlock_t lru_lock; 373 spinlock_t lru_lock;
368 struct zone_lru { 374 struct lruvec lruvec;
369 struct list_head list;
370 } lru[NR_LRU_LISTS];
371 375
372 struct zone_reclaim_stat reclaim_stat; 376 struct zone_reclaim_stat reclaim_stat;
373 377
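
mmzone.h folds the old zone->lru[] array into a single embedded struct lruvec, and the lru_list iterators are renamed from 'l' to 'lru'. A kernel-internal sketch of walking the new layout under zone->lru_lock (dump_zone_lru_lengths is hypothetical, for illustration only):

#include <linux/mm.h>
#include <linux/mmzone.h>

static void dump_zone_lru_lengths(struct zone *zone)
{
        enum lru_list lru;

        spin_lock_irq(&zone->lru_lock);
        for_each_lru(lru) {
                struct page *page;
                unsigned long n = 0;

                list_for_each_entry(page, &zone->lruvec.lists[lru], lru)
                        n++;
                printk(KERN_DEBUG "zone %s lru %d: %lu pages\n",
                       zone->name, (int)lru, n);
        }
        spin_unlock_irq(&zone->lru_lock);
}
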
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6f9d04a85336..552fba9c7d5a 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -43,7 +43,7 @@ enum oom_constraint {
43extern void compare_swap_oom_score_adj(int old_val, int new_val); 43extern void compare_swap_oom_score_adj(int old_val, int new_val);
44extern int test_set_oom_score_adj(int new_val); 44extern int test_set_oom_score_adj(int new_val);
45 45
46extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 46extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
47 const nodemask_t *nodemask, unsigned long totalpages); 47 const nodemask_t *nodemask, unsigned long totalpages);
48extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 48extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
49extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 49extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 961ecc7d30bc..a2d11771c84b 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -10,8 +10,6 @@ enum {
10 /* flags for mem_cgroup and file and I/O status */ 10 /* flags for mem_cgroup and file and I/O status */
11 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ 11 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
12 PCG_FILE_MAPPED, /* page is accounted as "mapped" */ 12 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
13 /* No lock in page_cgroup */
14 PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
15 __NR_PCG_FLAGS, 13 __NR_PCG_FLAGS,
16}; 14};
17 15
@@ -31,7 +29,6 @@ enum {
31struct page_cgroup { 29struct page_cgroup {
32 unsigned long flags; 30 unsigned long flags;
33 struct mem_cgroup *mem_cgroup; 31 struct mem_cgroup *mem_cgroup;
34 struct list_head lru; /* per cgroup LRU list */
35}; 32};
36 33
37void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); 34void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -76,12 +73,6 @@ TESTPCGFLAG(Used, USED)
76CLEARPCGFLAG(Used, USED) 73CLEARPCGFLAG(Used, USED)
77SETPCGFLAG(Used, USED) 74SETPCGFLAG(Used, USED)
78 75
79SETPCGFLAG(AcctLRU, ACCT_LRU)
80CLEARPCGFLAG(AcctLRU, ACCT_LRU)
81TESTPCGFLAG(AcctLRU, ACCT_LRU)
82TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
83
84
85SETPCGFLAG(FileMapped, FILE_MAPPED) 76SETPCGFLAG(FileMapped, FILE_MAPPED)
86CLEARPCGFLAG(FileMapped, FILE_MAPPED) 77CLEARPCGFLAG(FileMapped, FILE_MAPPED)
87TESTPCGFLAG(FileMapped, FILE_MAPPED) 78TESTPCGFLAG(FileMapped, FILE_MAPPED)
@@ -122,39 +113,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
122 local_irq_restore(*flags); 113 local_irq_restore(*flags);
123} 114}
124 115
125#ifdef CONFIG_SPARSEMEM
126#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT
127#else
128#define PCG_ARRAYID_WIDTH NODES_SHIFT
129#endif
130
131#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS)
132#error Not enough space left in pc->flags to store page_cgroup array IDs
133#endif
134
135/* pc->flags: ARRAY-ID | FLAGS */
136
137#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1)
138
139#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH)
140/*
141 * Zero the shift count for non-existent fields, to prevent compiler
142 * warnings and ensure references are optimized away.
143 */
144#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0))
145
146static inline void set_page_cgroup_array_id(struct page_cgroup *pc,
147 unsigned long id)
148{
149 pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT);
150 pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT;
151}
152
153static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
154{
155 return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK;
156}
157
158#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 116#else /* CONFIG_CGROUP_MEM_RES_CTLR */
159struct page_cgroup; 117struct page_cgroup;
160 118
@@ -183,7 +141,7 @@ static inline void __init page_cgroup_init_flatmem(void)
183extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 141extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
184 unsigned short old, unsigned short new); 142 unsigned short old, unsigned short new);
185extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); 143extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
186extern unsigned short lookup_swap_cgroup(swp_entry_t ent); 144extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
187extern int swap_cgroup_swapon(int type, unsigned long max_pages); 145extern int swap_cgroup_swapon(int type, unsigned long max_pages);
188extern void swap_cgroup_swapoff(int type); 146extern void swap_cgroup_swapoff(int type);
189#else 147#else
@@ -195,7 +153,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
195} 153}
196 154
197static inline 155static inline
198unsigned short lookup_swap_cgroup(swp_entry_t ent) 156unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
199{ 157{
200 return 0; 158 return 0;
201} 159}
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index ed17024d2ebe..2aa12b8499c0 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -21,8 +21,7 @@ struct pagevec {
21}; 21};
22 22
23void __pagevec_release(struct pagevec *pvec); 23void __pagevec_release(struct pagevec *pvec);
24void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); 24void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
25void pagevec_strip(struct pagevec *pvec);
26unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 25unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
27 pgoff_t start, unsigned nr_pages); 26 pgoff_t start, unsigned nr_pages);
28unsigned pagevec_lookup_tag(struct pagevec *pvec, 27unsigned pagevec_lookup_tag(struct pagevec *pvec,
@@ -59,7 +58,6 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
59 return pagevec_space(pvec); 58 return pagevec_space(pvec);
60} 59}
61 60
62
63static inline void pagevec_release(struct pagevec *pvec) 61static inline void pagevec_release(struct pagevec *pvec)
64{ 62{
65 if (pagevec_count(pvec)) 63 if (pagevec_count(pvec))
@@ -68,22 +66,22 @@ static inline void pagevec_release(struct pagevec *pvec)
68 66
69static inline void __pagevec_lru_add_anon(struct pagevec *pvec) 67static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
70{ 68{
71 ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); 69 __pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
72} 70}
73 71
74static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) 72static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
75{ 73{
76 ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); 74 __pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
77} 75}
78 76
79static inline void __pagevec_lru_add_file(struct pagevec *pvec) 77static inline void __pagevec_lru_add_file(struct pagevec *pvec)
80{ 78{
81 ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); 79 __pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
82} 80}
83 81
84static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) 82static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
85{ 83{
86 ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); 84 __pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
87} 85}
88 86
89static inline void pagevec_lru_add_file(struct pagevec *pvec) 87static inline void pagevec_lru_add_file(struct pagevec *pvec)
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2c2161..7ddc7f1b480f 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,16 @@
102 102
103#define PR_MCE_KILL_GET 34 103#define PR_MCE_KILL_GET 34
104 104
105/*
106 * Tune up process memory map specifics.
107 */
108#define PR_SET_MM 35
109# define PR_SET_MM_START_CODE 1
110# define PR_SET_MM_END_CODE 2
111# define PR_SET_MM_START_DATA 3
112# define PR_SET_MM_END_DATA 4
113# define PR_SET_MM_START_STACK 5
114# define PR_SET_MM_START_BRK 6
115# define PR_SET_MM_BRK 7
116
105#endif /* _LINUX_PRCTL_H */ 117#endif /* _LINUX_PRCTL_H */
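
The new PR_SET_MM prctl codes let a CAP_SYS_ADMIN task rewrite the mm_struct fields that checkpoint/restore needs to reconstruct; arguments 4 and 5 must be zero and the address must fall below TASK_SIZE (see the prctl_set_mm() checks in kernel/sys.c below). A hypothetical restore-side sketch:

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_MM                       /* values from the hunk above */
#define PR_SET_MM               35
#define PR_SET_MM_START_BRK     6
#define PR_SET_MM_BRK           7
#endif

int main(void)
{
        unsigned long new_brk = (unsigned long)sbrk(0);

        if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_brk, 0, 0))
                perror("PR_SET_MM_START_BRK");  /* EPERM without CAP_SYS_ADMIN */
        if (prctl(PR_SET_MM, PR_SET_MM_BRK, new_brk, 0, 0))
                perror("PR_SET_MM_BRK");
        return 0;
}
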
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 9d4539c52e53..07e360b1b282 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -49,9 +49,6 @@
49#define RADIX_TREE_EXCEPTIONAL_ENTRY 2 49#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
50#define RADIX_TREE_EXCEPTIONAL_SHIFT 2 50#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
51 51
52#define radix_tree_indirect_to_ptr(ptr) \
53 radix_tree_indirect_to_ptr((void __force *)(ptr))
54
55static inline int radix_tree_is_indirect_ptr(void *ptr) 52static inline int radix_tree_is_indirect_ptr(void *ptr)
56{ 53{
57 return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); 54 return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1afb9954bbf1..1cdd62a2788a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -158,7 +158,7 @@ static inline void page_dup_rmap(struct page *page)
158 * Called from mm/vmscan.c to handle paging out 158 * Called from mm/vmscan.c to handle paging out
159 */ 159 */
160int page_referenced(struct page *, int is_locked, 160int page_referenced(struct page *, int is_locked,
161 struct mem_cgroup *cnt, unsigned long *vm_flags); 161 struct mem_cgroup *memcg, unsigned long *vm_flags);
162int page_referenced_one(struct page *, struct vm_area_struct *, 162int page_referenced_one(struct page *, struct vm_area_struct *,
163 unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); 163 unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
164 164
@@ -236,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
236#define anon_vma_link(vma) do {} while (0) 236#define anon_vma_link(vma) do {} while (0)
237 237
238static inline int page_referenced(struct page *page, int is_locked, 238static inline int page_referenced(struct page *page, int is_locked,
239 struct mem_cgroup *cnt, 239 struct mem_cgroup *memcg,
240 unsigned long *vm_flags) 240 unsigned long *vm_flags)
241{ 241{
242 *vm_flags = 0; 242 *vm_flags = 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21cd0303af51..4032ec1cf836 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *);
2275extern void exit_itimers(struct signal_struct *); 2275extern void exit_itimers(struct signal_struct *);
2276extern void flush_itimer_signals(void); 2276extern void flush_itimer_signals(void);
2277 2277
2278extern NORET_TYPE void do_group_exit(int); 2278extern void do_group_exit(int);
2279 2279
2280extern void daemonize(const char *, ...); 2280extern void daemonize(const char *, ...);
2281extern int allow_signal(int); 2281extern int allow_signal(int);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index edc4b3d25a2d..f64560e204bc 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -266,9 +266,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
266 unsigned long nr_lumpy_taken, 266 unsigned long nr_lumpy_taken,
267 unsigned long nr_lumpy_dirty, 267 unsigned long nr_lumpy_dirty,
268 unsigned long nr_lumpy_failed, 268 unsigned long nr_lumpy_failed,
269 isolate_mode_t isolate_mode), 269 isolate_mode_t isolate_mode,
270 int file),
270 271
271 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), 272 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file),
272 273
273 TP_STRUCT__entry( 274 TP_STRUCT__entry(
274 __field(int, order) 275 __field(int, order)
@@ -279,6 +280,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
279 __field(unsigned long, nr_lumpy_dirty) 280 __field(unsigned long, nr_lumpy_dirty)
280 __field(unsigned long, nr_lumpy_failed) 281 __field(unsigned long, nr_lumpy_failed)
281 __field(isolate_mode_t, isolate_mode) 282 __field(isolate_mode_t, isolate_mode)
283 __field(int, file)
282 ), 284 ),
283 285
284 TP_fast_assign( 286 TP_fast_assign(
@@ -290,9 +292,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
290 __entry->nr_lumpy_dirty = nr_lumpy_dirty; 292 __entry->nr_lumpy_dirty = nr_lumpy_dirty;
291 __entry->nr_lumpy_failed = nr_lumpy_failed; 293 __entry->nr_lumpy_failed = nr_lumpy_failed;
292 __entry->isolate_mode = isolate_mode; 294 __entry->isolate_mode = isolate_mode;
295 __entry->file = file;
293 ), 296 ),
294 297
295 TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu", 298 TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d",
296 __entry->isolate_mode, 299 __entry->isolate_mode,
297 __entry->order, 300 __entry->order,
298 __entry->nr_requested, 301 __entry->nr_requested,
@@ -300,7 +303,8 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
300 __entry->nr_taken, 303 __entry->nr_taken,
301 __entry->nr_lumpy_taken, 304 __entry->nr_lumpy_taken,
302 __entry->nr_lumpy_dirty, 305 __entry->nr_lumpy_dirty,
303 __entry->nr_lumpy_failed) 306 __entry->nr_lumpy_failed,
307 __entry->file)
304); 308);
305 309
306DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, 310DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
@@ -312,9 +316,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
312 unsigned long nr_lumpy_taken, 316 unsigned long nr_lumpy_taken,
313 unsigned long nr_lumpy_dirty, 317 unsigned long nr_lumpy_dirty,
314 unsigned long nr_lumpy_failed, 318 unsigned long nr_lumpy_failed,
315 isolate_mode_t isolate_mode), 319 isolate_mode_t isolate_mode,
320 int file),
316 321
317 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) 322 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
318 323
319); 324);
320 325
@@ -327,9 +332,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
327 unsigned long nr_lumpy_taken, 332 unsigned long nr_lumpy_taken,
328 unsigned long nr_lumpy_dirty, 333 unsigned long nr_lumpy_dirty,
329 unsigned long nr_lumpy_failed, 334 unsigned long nr_lumpy_failed,
330 isolate_mode_t isolate_mode), 335 isolate_mode_t isolate_mode,
336 int file),
331 337
332 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) 338 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
333 339
334); 340);
335 341
diff --git a/init/Kconfig b/init/Kconfig
index 018d206c21f7..6ac2236244c3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -783,6 +783,17 @@ config DEBUG_BLK_CGROUP
783 783
784endif # CGROUPS 784endif # CGROUPS
785 785
786config CHECKPOINT_RESTORE
787 bool "Checkpoint/restore support" if EXPERT
788 default n
789 help
790 Enables additional kernel features for the sake of checkpoint/restore.
791 In particular it adds auxiliary prctl codes to set up process text,
792 data and heap segment sizes, and a few additional /proc filesystem
793 entries.
794
795 If unsure, say N here.
796
786menuconfig NAMESPACES 797menuconfig NAMESPACES
787 bool "Namespaces support" if EXPERT 798 bool "Namespaces support" if EXPERT
788 default !EXPERT 799 default !EXPERT
diff --git a/kernel/exit.c b/kernel/exit.c
index 94ed6e20bb53..c44738267be7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -887,7 +887,7 @@ static void check_stack_usage(void)
887static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
888#endif 888#endif
889 889
890NORET_TYPE void do_exit(long code) 890void do_exit(long code)
891{ 891{
892 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
893 int group_dead; 893 int group_dead;
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code)
1051 1051
1052EXPORT_SYMBOL_GPL(do_exit); 1052EXPORT_SYMBOL_GPL(do_exit);
1053 1053
1054NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1054void complete_and_exit(struct completion *comp, long code)
1055{ 1055{
1056 if (comp) 1056 if (comp)
1057 complete(comp); 1057 complete(comp);
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1070 * Take down every thread in the group. This is called by fatal signals 1070 * Take down every thread in the group. This is called by fatal signals
1071 * as well as by sys_exit_group (below). 1071 * as well as by sys_exit_group (below).
1072 */ 1072 */
1073NORET_TYPE void 1073void
1074do_group_exit(int exit_code) 1074do_group_exit(int exit_code)
1075{ 1075{
1076 struct signal_struct *sig = current->signal; 1076 struct signal_struct *sig = current->signal;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 090ee10d9604..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1094 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1095 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1096 1095
1097 kmsg_dump(KMSG_DUMP_KEXEC);
1098
1099 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1100 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1101 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1132{ 1129{
1133 int ret = 0; 1130 int ret = 0;
1134 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1135 1134
1136 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1137 1136
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
1141 } 1140 }
1142 start = crashk_res.start; 1141 start = crashk_res.start;
1143 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1144 1148
1145 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1146 ret = -EINVAL; 1150 if (!ram_res) {
1147 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1148 ret = 0;
1149 goto unlock; 1152 goto unlock;
1150 } 1153 }
1151 1154
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
1157 1160
1158 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1159 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1160 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1170
1171 insert_resource(&iomem_resource, ram_res);
1161 crash_unmap_reserved_pages(); 1172 crash_unmap_reserved_pages();
1162 1173
1163unlock: 1174unlock:
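
crash_shrink_memory() now treats new_size == old_size as a no-op and returns the released range to the iomem tree as "System RAM"; the user-visible knob is /sys/kernel/kexec_crash_size. A rough sketch of shrinking the reservation at run time (illustration only; needs root):

#include <stdio.h>

int main(void)
{
        unsigned long size;
        FILE *f = fopen("/sys/kernel/kexec_crash_size", "r+");

        if (!f)
                return 1;
        if (fscanf(f, "%lu", &size) == 1 && size) {
                rewind(f);
                fprintf(f, "%lu\n", size / 2);  /* give half back as RAM */
        }
        fclose(f);
        return 0;
}
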
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/panic.c b/kernel/panic.c
index 3458469eb7c3..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
53 * Stop ourself in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
84 * stop themself or will wait until they are stopped by the 1st CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
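
panic() is now serialized with a static spinlock: the first CPU in wins, and later CPUs call panic_smp_self_stop(), a __weak default that spins in cpu_relax() until smp_send_stop() takes them down. An architecture could override it, for example (hypothetical sketch, not from this patch):

#include <linux/irqflags.h>
#include <asm/processor.h>

void panic_smp_self_stop(void)
{
        /* park losing CPUs with interrupts masked instead of the
         * default busy loop; the exact low-power wait is per-arch */
        local_irq_disable();
        while (1)
                cpu_relax();
}
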
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
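
The reworded comment describes a lockless update in which the later value must win, so a freshly released PID is not reused immediately. A userspace sketch of that idea with a compare-and-swap loop; pid_before() below is a simplified stand-in that ignores the wraparound handling of the real helper:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int last_pid = 300;

/* simplified: ignores PID wraparound, which the real pid_before() handles */
static int pid_before(int a, int b)
{
        return a < b;
}

/* Only move last_pid forward; if a racing writer already stored a later
 * value, give up rather than move it backwards. */
static void set_last_pid(int pid)
{
        int prev = atomic_load(&last_pid);

        do {
                if (!pid_before(prev, pid))
                        return; /* a later value already won the race */
        } while (!atomic_compare_exchange_weak(&last_pid, &prev, pid));
}

int main(void)
{
        set_last_pid(400);
        set_last_pid(350);      /* loses: 400 is already the later value */
        printf("last_pid = %d\n", atomic_load(&last_pid));
        return 0;
}
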
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
204 * is volatile in a living namespace anyway and a code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
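
The new kernel.ns_last_pid sysctl lets a privileged task steer which PID the next fork() in its namespace receives, which is what userspace checkpoint/restore needs. A minimal sketch, assuming CAP_SYS_ADMIN and no concurrent fork() racing for the same slot:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");
        pid_t pid;

        if (!f) {
                perror("/proc/sys/kernel/ns_last_pid");
                return 1;
        }
        /* the next fork() in this pid namespace should get PID 10000 */
        fprintf(f, "%d", 9999);
        fclose(f);

        pid = fork();
        if (pid == 0) {
                printf("child got pid %d\n", (int)getpid());
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        return 0;
}
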
diff --git a/kernel/sys.c b/kernel/sys.c
index ddf8155bf3f8..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1692 return mask; 1692 return mask;
1693} 1693}
1694 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1695SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1696 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1697{ 1815{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1841 else 1959 else
1842 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1843 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1844 default: 1965 default:
1845 error = -EINVAL; 1966 error = -EINVAL;
1846 break; 1967 break;
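
prctl(PR_SET_MM, ...) gives a CAP_SYS_ADMIN task a way to rewrite selected mm_struct fields when restoring a checkpointed process, subject to the VMA and rlimit sanity checks shown above. A hedged userspace sketch; the numeric PR_SET_MM* values below are assumed from the prctl.h hunk of this series (not shown here) and are only used if the libc headers do not already provide them:

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_MM                       /* assumed values, per this series */
#define PR_SET_MM               35
#define PR_SET_MM_START_BRK     6
#endif

int main(void)
{
        /* pick a brk start just above the current program break */
        unsigned long new_start_brk = (unsigned long)sbrk(0) + 2 * 4096;

        /* needs CAP_SYS_ADMIN and a CONFIG_CHECKPOINT_RESTORE kernel */
        if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_start_brk, 0, 0))
                perror("PR_SET_MM_START_BRK");
        else
                printf("start_brk moved to %#lx\n", new_start_brk);
        return 0;
}
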
diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c
index 5a7a2adf4c4c..4531294fa62f 100644
--- a/lib/decompress_unlzo.c
+++ b/lib/decompress_unlzo.c
@@ -279,7 +279,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len,
279 ret = 0; 279 ret = 0;
280exit_2: 280exit_2:
281 if (!input) 281 if (!input)
282 free(in_buf); 282 free(in_buf_save);
283exit_1: 283exit_1:
284 if (!output) 284 if (!output)
285 free(out_buf); 285 free(out_buf);
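
The unlzo fix frees in_buf_save, the pointer originally returned by the allocator, instead of in_buf, which the decompressor advances as it walks the stream. The underlying rule as a tiny sketch:

#include <stdlib.h>
#include <string.h>

int main(void)
{
        unsigned char *in_buf = malloc(64);
        unsigned char *in_buf_save = in_buf;    /* keep the allocation address */

        if (!in_buf)
                return 1;
        memset(in_buf, 0, 64);

        in_buf += 16;           /* the parser skips a header... */
        in_buf += 32;           /* ...and keeps advancing through the blocks */

        /* correct: free the address malloc() returned, not the moved cursor */
        free(in_buf_save);
        return 0;
}
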
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d9df7454519c..dc63d0818394 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -48,16 +48,14 @@
48struct radix_tree_node { 48struct radix_tree_node {
49 unsigned int height; /* Height from the bottom */ 49 unsigned int height; /* Height from the bottom */
50 unsigned int count; 50 unsigned int count;
51 struct rcu_head rcu_head; 51 union {
52 struct radix_tree_node *parent; /* Used when ascending tree */
53 struct rcu_head rcu_head; /* Used when freeing node */
54 };
52 void __rcu *slots[RADIX_TREE_MAP_SIZE]; 55 void __rcu *slots[RADIX_TREE_MAP_SIZE];
53 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 56 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
54}; 57};
55 58
56struct radix_tree_path {
57 struct radix_tree_node *node;
58 int offset;
59};
60
61#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) 59#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
62#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ 60#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
63 RADIX_TREE_MAP_SHIFT)) 61 RADIX_TREE_MAP_SHIFT))
@@ -256,6 +254,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height)
256static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) 254static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
257{ 255{
258 struct radix_tree_node *node; 256 struct radix_tree_node *node;
257 struct radix_tree_node *slot;
259 unsigned int height; 258 unsigned int height;
260 int tag; 259 int tag;
261 260
@@ -274,18 +273,23 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
274 if (!(node = radix_tree_node_alloc(root))) 273 if (!(node = radix_tree_node_alloc(root)))
275 return -ENOMEM; 274 return -ENOMEM;
276 275
277 /* Increase the height. */
278 node->slots[0] = indirect_to_ptr(root->rnode);
279
280 /* Propagate the aggregated tag info into the new root */ 276 /* Propagate the aggregated tag info into the new root */
281 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 277 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
282 if (root_tag_get(root, tag)) 278 if (root_tag_get(root, tag))
283 tag_set(node, tag, 0); 279 tag_set(node, tag, 0);
284 } 280 }
285 281
282 /* Increase the height. */
286 newheight = root->height+1; 283 newheight = root->height+1;
287 node->height = newheight; 284 node->height = newheight;
288 node->count = 1; 285 node->count = 1;
286 node->parent = NULL;
287 slot = root->rnode;
288 if (newheight > 1) {
289 slot = indirect_to_ptr(slot);
290 slot->parent = node;
291 }
292 node->slots[0] = slot;
289 node = ptr_to_indirect(node); 293 node = ptr_to_indirect(node);
290 rcu_assign_pointer(root->rnode, node); 294 rcu_assign_pointer(root->rnode, node);
291 root->height = newheight; 295 root->height = newheight;
@@ -331,6 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
331 if (!(slot = radix_tree_node_alloc(root))) 335 if (!(slot = radix_tree_node_alloc(root)))
332 return -ENOMEM; 336 return -ENOMEM;
333 slot->height = height; 337 slot->height = height;
338 slot->parent = node;
334 if (node) { 339 if (node) {
335 rcu_assign_pointer(node->slots[offset], slot); 340 rcu_assign_pointer(node->slots[offset], slot);
336 node->count++; 341 node->count++;
@@ -504,47 +509,41 @@ EXPORT_SYMBOL(radix_tree_tag_set);
504void *radix_tree_tag_clear(struct radix_tree_root *root, 509void *radix_tree_tag_clear(struct radix_tree_root *root,
505 unsigned long index, unsigned int tag) 510 unsigned long index, unsigned int tag)
506{ 511{
507 /* 512 struct radix_tree_node *node = NULL;
508 * The radix tree path needs to be one longer than the maximum path
509 * since the "list" is null terminated.
510 */
511 struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
512 struct radix_tree_node *slot = NULL; 513 struct radix_tree_node *slot = NULL;
513 unsigned int height, shift; 514 unsigned int height, shift;
515 int uninitialized_var(offset);
514 516
515 height = root->height; 517 height = root->height;
516 if (index > radix_tree_maxindex(height)) 518 if (index > radix_tree_maxindex(height))
517 goto out; 519 goto out;
518 520
519 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 521 shift = height * RADIX_TREE_MAP_SHIFT;
520 pathp->node = NULL;
521 slot = indirect_to_ptr(root->rnode); 522 slot = indirect_to_ptr(root->rnode);
522 523
523 while (height > 0) { 524 while (shift) {
524 int offset;
525
526 if (slot == NULL) 525 if (slot == NULL)
527 goto out; 526 goto out;
528 527
528 shift -= RADIX_TREE_MAP_SHIFT;
529 offset = (index >> shift) & RADIX_TREE_MAP_MASK; 529 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
530 pathp[1].offset = offset; 530 node = slot;
531 pathp[1].node = slot;
532 slot = slot->slots[offset]; 531 slot = slot->slots[offset];
533 pathp++;
534 shift -= RADIX_TREE_MAP_SHIFT;
535 height--;
536 } 532 }
537 533
538 if (slot == NULL) 534 if (slot == NULL)
539 goto out; 535 goto out;
540 536
541 while (pathp->node) { 537 while (node) {
542 if (!tag_get(pathp->node, tag, pathp->offset)) 538 if (!tag_get(node, tag, offset))
543 goto out; 539 goto out;
544 tag_clear(pathp->node, tag, pathp->offset); 540 tag_clear(node, tag, offset);
545 if (any_tag_set(pathp->node, tag)) 541 if (any_tag_set(node, tag))
546 goto out; 542 goto out;
547 pathp--; 543
544 index >>= RADIX_TREE_MAP_SHIFT;
545 offset = index & RADIX_TREE_MAP_MASK;
546 node = node->parent;
548 } 547 }
549 548
550 /* clear the root's tag bit */ 549 /* clear the root's tag bit */
@@ -646,8 +645,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
646 unsigned int iftag, unsigned int settag) 645 unsigned int iftag, unsigned int settag)
647{ 646{
648 unsigned int height = root->height; 647 unsigned int height = root->height;
649 struct radix_tree_path path[height]; 648 struct radix_tree_node *node = NULL;
650 struct radix_tree_path *pathp = path;
651 struct radix_tree_node *slot; 649 struct radix_tree_node *slot;
652 unsigned int shift; 650 unsigned int shift;
653 unsigned long tagged = 0; 651 unsigned long tagged = 0;
@@ -671,14 +669,8 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
671 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 669 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
672 slot = indirect_to_ptr(root->rnode); 670 slot = indirect_to_ptr(root->rnode);
673 671
674 /*
675 * we fill the path from (root->height - 2) to 0, leaving the index at
676 * (root->height - 1) as a terminator. Zero the node in the terminator
677 * so that we can use this to end walk loops back up the path.
678 */
679 path[height - 1].node = NULL;
680
681 for (;;) { 672 for (;;) {
673 unsigned long upindex;
682 int offset; 674 int offset;
683 675
684 offset = (index >> shift) & RADIX_TREE_MAP_MASK; 676 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -686,12 +678,10 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
686 goto next; 678 goto next;
687 if (!tag_get(slot, iftag, offset)) 679 if (!tag_get(slot, iftag, offset))
688 goto next; 680 goto next;
689 if (height > 1) { 681 if (shift) {
690 /* Go down one level */ 682 /* Go down one level */
691 height--;
692 shift -= RADIX_TREE_MAP_SHIFT; 683 shift -= RADIX_TREE_MAP_SHIFT;
693 path[height - 1].node = slot; 684 node = slot;
694 path[height - 1].offset = offset;
695 slot = slot->slots[offset]; 685 slot = slot->slots[offset];
696 continue; 686 continue;
697 } 687 }
@@ -701,15 +691,27 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
701 tag_set(slot, settag, offset); 691 tag_set(slot, settag, offset);
702 692
703 /* walk back up the path tagging interior nodes */ 693 /* walk back up the path tagging interior nodes */
704 pathp = &path[0]; 694 upindex = index;
705 while (pathp->node) { 695 while (node) {
696 upindex >>= RADIX_TREE_MAP_SHIFT;
697 offset = upindex & RADIX_TREE_MAP_MASK;
698
706 /* stop if we find a node with the tag already set */ 699 /* stop if we find a node with the tag already set */
707 if (tag_get(pathp->node, settag, pathp->offset)) 700 if (tag_get(node, settag, offset))
708 break; 701 break;
709 tag_set(pathp->node, settag, pathp->offset); 702 tag_set(node, settag, offset);
710 pathp++; 703 node = node->parent;
711 } 704 }
712 705
706 /*
707 * Small optimization: now clear that node pointer.
708 * Since all of this slot's ancestors now have the tag set
709 * from setting it above, we have no further need to walk
710 * back up the tree setting tags, until we update slot to
711 * point to another radix_tree_node.
712 */
713 node = NULL;
714
713next: 715next:
714 /* Go to next item at level determined by 'shift' */ 716 /* Go to next item at level determined by 'shift' */
715 index = ((index >> shift) + 1) << shift; 717 index = ((index >> shift) + 1) << shift;
@@ -724,8 +726,7 @@ next:
724 * last_index is guaranteed to be in the tree, what 726 * last_index is guaranteed to be in the tree, what
725 * we do below cannot wander astray. 727 * we do below cannot wander astray.
726 */ 728 */
727 slot = path[height - 1].node; 729 slot = slot->parent;
728 height++;
729 shift += RADIX_TREE_MAP_SHIFT; 730 shift += RADIX_TREE_MAP_SHIFT;
730 } 731 }
731 } 732 }
@@ -1299,7 +1300,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1299 /* try to shrink tree height */ 1300 /* try to shrink tree height */
1300 while (root->height > 0) { 1301 while (root->height > 0) {
1301 struct radix_tree_node *to_free = root->rnode; 1302 struct radix_tree_node *to_free = root->rnode;
1302 void *newptr; 1303 struct radix_tree_node *slot;
1303 1304
1304 BUG_ON(!radix_tree_is_indirect_ptr(to_free)); 1305 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1305 to_free = indirect_to_ptr(to_free); 1306 to_free = indirect_to_ptr(to_free);
@@ -1320,10 +1321,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1320 * (to_free->slots[0]), it will be safe to dereference the new 1321 * (to_free->slots[0]), it will be safe to dereference the new
1321 * one (root->rnode) as far as dependent read barriers go. 1322 * one (root->rnode) as far as dependent read barriers go.
1322 */ 1323 */
1323 newptr = to_free->slots[0]; 1324 slot = to_free->slots[0];
1324 if (root->height > 1) 1325 if (root->height > 1) {
1325 newptr = ptr_to_indirect(newptr); 1326 slot->parent = NULL;
1326 root->rnode = newptr; 1327 slot = ptr_to_indirect(slot);
1328 }
1329 root->rnode = slot;
1327 root->height--; 1330 root->height--;
1328 1331
1329 /* 1332 /*
@@ -1363,16 +1366,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1363 */ 1366 */
1364void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) 1367void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1365{ 1368{
1366 /* 1369 struct radix_tree_node *node = NULL;
1367 * The radix tree path needs to be one longer than the maximum path
1368 * since the "list" is null terminated.
1369 */
1370 struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
1371 struct radix_tree_node *slot = NULL; 1370 struct radix_tree_node *slot = NULL;
1372 struct radix_tree_node *to_free; 1371 struct radix_tree_node *to_free;
1373 unsigned int height, shift; 1372 unsigned int height, shift;
1374 int tag; 1373 int tag;
1375 int offset; 1374 int uninitialized_var(offset);
1376 1375
1377 height = root->height; 1376 height = root->height;
1378 if (index > radix_tree_maxindex(height)) 1377 if (index > radix_tree_maxindex(height))
@@ -1385,39 +1384,35 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1385 goto out; 1384 goto out;
1386 } 1385 }
1387 slot = indirect_to_ptr(slot); 1386 slot = indirect_to_ptr(slot);
1388 1387 shift = height * RADIX_TREE_MAP_SHIFT;
1389 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1390 pathp->node = NULL;
1391 1388
1392 do { 1389 do {
1393 if (slot == NULL) 1390 if (slot == NULL)
1394 goto out; 1391 goto out;
1395 1392
1396 pathp++; 1393 shift -= RADIX_TREE_MAP_SHIFT;
1397 offset = (index >> shift) & RADIX_TREE_MAP_MASK; 1394 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
1398 pathp->offset = offset; 1395 node = slot;
1399 pathp->node = slot;
1400 slot = slot->slots[offset]; 1396 slot = slot->slots[offset];
1401 shift -= RADIX_TREE_MAP_SHIFT; 1397 } while (shift);
1402 height--;
1403 } while (height > 0);
1404 1398
1405 if (slot == NULL) 1399 if (slot == NULL)
1406 goto out; 1400 goto out;
1407 1401
1408 /* 1402 /*
1409 * Clear all tags associated with the just-deleted item 1403 * Clear all tags associated with the item to be deleted.
1404 * This way of doing it would be inefficient, but seldom is any set.
1410 */ 1405 */
1411 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 1406 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
1412 if (tag_get(pathp->node, tag, pathp->offset)) 1407 if (tag_get(node, tag, offset))
1413 radix_tree_tag_clear(root, index, tag); 1408 radix_tree_tag_clear(root, index, tag);
1414 } 1409 }
1415 1410
1416 to_free = NULL; 1411 to_free = NULL;
1417 /* Now free the nodes we do not need anymore */ 1412 /* Now free the nodes we do not need anymore */
1418 while (pathp->node) { 1413 while (node) {
1419 pathp->node->slots[pathp->offset] = NULL; 1414 node->slots[offset] = NULL;
1420 pathp->node->count--; 1415 node->count--;
1421 /* 1416 /*
1422 * Queue the node for deferred freeing after the 1417 * Queue the node for deferred freeing after the
1423 * last reference to it disappears (set NULL, above). 1418 * last reference to it disappears (set NULL, above).
@@ -1425,17 +1420,20 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1425 if (to_free) 1420 if (to_free)
1426 radix_tree_node_free(to_free); 1421 radix_tree_node_free(to_free);
1427 1422
1428 if (pathp->node->count) { 1423 if (node->count) {
1429 if (pathp->node == indirect_to_ptr(root->rnode)) 1424 if (node == indirect_to_ptr(root->rnode))
1430 radix_tree_shrink(root); 1425 radix_tree_shrink(root);
1431 goto out; 1426 goto out;
1432 } 1427 }
1433 1428
1434 /* Node with zero slots in use so free it */ 1429 /* Node with zero slots in use so free it */
1435 to_free = pathp->node; 1430 to_free = node;
1436 pathp--;
1437 1431
1432 index >>= RADIX_TREE_MAP_SHIFT;
1433 offset = index & RADIX_TREE_MAP_MASK;
1434 node = node->parent;
1438 } 1435 }
1436
1439 root_tag_clear_all(root); 1437 root_tag_clear_all(root);
1440 root->height = 0; 1438 root->height = 0;
1441 root->rnode = NULL; 1439 root->rnode = NULL;
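
The radix-tree rework drops the on-stack radix_tree_path arrays and instead stores a parent pointer in each node (sharing storage with the RCU head via a union), so tag clearing and deletion walk back up through node->parent. A reduced sketch of that ascent pattern, using a per-node counter in place of the real per-slot tag bitmaps:

#include <stdio.h>

/* Each node remembers its parent, so walking back toward the root needs
 * no on-stack path array; counters stand in for the per-slot tag bitmaps. */
struct node {
        struct node *parent;
        int tagged;             /* how many slots below still carry the tag */
};

static void tag_clear_upward(struct node *node)
{
        while (node) {
                node->tagged--;
                if (node->tagged > 0)
                        break;  /* another slot is still tagged: stop the ascent */
                node = node->parent;
        }
}

int main(void)
{
        struct node root = { .parent = NULL, .tagged = 2 };
        struct node mid  = { .parent = &root, .tagged = 1 };

        tag_clear_upward(&mid);
        /* mid drops to 0; root keeps 1 for its other tagged subtree */
        printf("mid %d, root %d\n", mid.tagged, root.tagged);
        return 0;
}
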
diff --git a/mm/compaction.c b/mm/compaction.c
index e6670c34eb49..71a58f67f481 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -350,7 +350,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
350 } 350 }
351 351
352 if (!cc->sync) 352 if (!cc->sync)
353 mode |= ISOLATE_CLEAN; 353 mode |= ISOLATE_ASYNC_MIGRATE;
354 354
355 /* Try isolate the page */ 355 /* Try isolate the page */
356 if (__isolate_lru_page(page, mode, 0) != 0) 356 if (__isolate_lru_page(page, mode, 0) != 0)
@@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
557 nr_migrate = cc->nr_migratepages; 557 nr_migrate = cc->nr_migratepages;
558 err = migrate_pages(&cc->migratepages, compaction_alloc, 558 err = migrate_pages(&cc->migratepages, compaction_alloc,
559 (unsigned long)cc, false, 559 (unsigned long)cc, false,
560 cc->sync); 560 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
561 update_nr_listpages(cc); 561 update_nr_listpages(cc);
562 nr_remaining = cc->nr_migratepages; 562 nr_remaining = cc->nr_migratepages;
563 563
@@ -671,6 +671,7 @@ static int compact_node(int nid)
671 .nr_freepages = 0, 671 .nr_freepages = 0,
672 .nr_migratepages = 0, 672 .nr_migratepages = 0,
673 .order = -1, 673 .order = -1,
674 .sync = true,
674 }; 675 };
675 676
676 zone = &pgdat->node_zones[zoneid]; 677 zone = &pgdat->node_zones[zoneid];
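
Compaction now maps its old boolean sync flag onto the new migrate modes: asynchronous compaction requests MIGRATE_ASYNC and synchronous compaction MIGRATE_SYNC_LIGHT. A sketch of that mapping; the enum layout below, including the fully blocking MIGRATE_SYNC used by other callers, is assumed from the rest of this series:

#include <stdio.h>

/* assumed to mirror the enum migrate_mode introduced by this series */
enum migrate_mode {
        MIGRATE_ASYNC,          /* never block during migration */
        MIGRATE_SYNC_LIGHT,     /* allow light blocking (compaction's sync mode) */
        MIGRATE_SYNC,           /* block on everything; used by other callers */
};

static enum migrate_mode compaction_migrate_mode(int sync)
{
        return sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC;
}

int main(void)
{
        printf("sync=0 -> %d, sync=1 -> %d\n",
               compaction_migrate_mode(0), compaction_migrate_mode(1));
        return 0;
}
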
diff --git a/mm/filemap.c b/mm/filemap.c
index c4ee2e918bea..97f49ed35bd2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
394{ 394{
395 int error; 395 int error;
396 struct mem_cgroup *memcg = NULL;
397 396
398 VM_BUG_ON(!PageLocked(old)); 397 VM_BUG_ON(!PageLocked(old));
399 VM_BUG_ON(!PageLocked(new)); 398 VM_BUG_ON(!PageLocked(new));
400 VM_BUG_ON(new->mapping); 399 VM_BUG_ON(new->mapping);
401 400
402 /*
403 * This is not page migration, but prepare_migration and
404 * end_migration does enough work for charge replacement.
405 *
406 * In the longer term we probably want a specialized function
407 * for moving the charge from old to new in a more efficient
408 * manner.
409 */
410 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
411 if (error)
412 return error;
413
414 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 401 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
415 if (!error) { 402 if (!error) {
416 struct address_space *mapping = old->mapping; 403 struct address_space *mapping = old->mapping;
@@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
432 if (PageSwapBacked(new)) 419 if (PageSwapBacked(new))
433 __inc_zone_page_state(new, NR_SHMEM); 420 __inc_zone_page_state(new, NR_SHMEM);
434 spin_unlock_irq(&mapping->tree_lock); 421 spin_unlock_irq(&mapping->tree_lock);
422 /* mem_cgroup codes must not be called under tree_lock */
423 mem_cgroup_replace_page_cache(old, new);
435 radix_tree_preload_end(); 424 radix_tree_preload_end();
436 if (freepage) 425 if (freepage)
437 freepage(old); 426 freepage(old);
438 page_cache_release(old); 427 page_cache_release(old);
439 mem_cgroup_end_migration(memcg, old, new, true);
440 } else {
441 mem_cgroup_end_migration(memcg, old, new, false);
442 } 428 }
443 429
444 return error; 430 return error;
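
replace_page_cache_page() now finishes its radix-tree update, drops mapping->tree_lock, and only then calls the new mem_cgroup_replace_page_cache() helper, since memcg code must not run under that spinlock. The lock-ordering discipline in isolation, sketched with pthread mutexes standing in for the spinlock and for whatever locks the callback may take:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t accounting_lock = PTHREAD_MUTEX_INITIALIZER;

/* may take its own locks (or sleep): must never run under tree_lock */
static void accounting_update(void)
{
        pthread_mutex_lock(&accounting_lock);
        puts("accounting updated outside tree_lock");
        pthread_mutex_unlock(&accounting_lock);
}

static void replace_entry(void)
{
        pthread_mutex_lock(&tree_lock);
        /* ... swap the entry in the shared structure ... */
        pthread_mutex_unlock(&tree_lock);

        /* only now call into the subsystem with different locking rules */
        accounting_update();
}

int main(void)
{
        replace_entry();
        return 0;
}
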
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 36b3d988b4ef..b3ffc21ce801 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = {
487 .attrs = khugepaged_attr, 487 .attrs = khugepaged_attr,
488 .name = "khugepaged", 488 .name = "khugepaged",
489}; 489};
490#endif /* CONFIG_SYSFS */
491 490
492static int __init hugepage_init(void) 491static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
493{ 492{
494 int err; 493 int err;
495#ifdef CONFIG_SYSFS
496 static struct kobject *hugepage_kobj;
497#endif
498
499 err = -EINVAL;
500 if (!has_transparent_hugepage()) {
501 transparent_hugepage_flags = 0;
502 goto out;
503 }
504 494
505#ifdef CONFIG_SYSFS 495 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
506 err = -ENOMEM; 496 if (unlikely(!*hugepage_kobj)) {
507 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
508 if (unlikely(!hugepage_kobj)) {
509 printk(KERN_ERR "hugepage: failed kobject create\n"); 497 printk(KERN_ERR "hugepage: failed kobject create\n");
510 goto out; 498 return -ENOMEM;
511 } 499 }
512 500
513 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); 501 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
514 if (err) { 502 if (err) {
515 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 503 printk(KERN_ERR "hugepage: failed register hugeage group\n");
516 goto out; 504 goto delete_obj;
517 } 505 }
518 506
519 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); 507 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
520 if (err) { 508 if (err) {
521 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 509 printk(KERN_ERR "hugepage: failed register hugeage group\n");
522 goto out; 510 goto remove_hp_group;
523 } 511 }
524#endif 512
513 return 0;
514
515remove_hp_group:
516 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
517delete_obj:
518 kobject_put(*hugepage_kobj);
519 return err;
520}
521
522static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
523{
524 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
525 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
526 kobject_put(hugepage_kobj);
527}
528#else
529static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
530{
531 return 0;
532}
533
534static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
535{
536}
537#endif /* CONFIG_SYSFS */
538
539static int __init hugepage_init(void)
540{
541 int err;
542 struct kobject *hugepage_kobj;
543
544 if (!has_transparent_hugepage()) {
545 transparent_hugepage_flags = 0;
546 return -EINVAL;
547 }
548
549 err = hugepage_init_sysfs(&hugepage_kobj);
550 if (err)
551 return err;
525 552
526 err = khugepaged_slab_init(); 553 err = khugepaged_slab_init();
527 if (err) 554 if (err)
@@ -545,7 +572,9 @@ static int __init hugepage_init(void)
545 572
546 set_recommended_min_free_kbytes(); 573 set_recommended_min_free_kbytes();
547 574
575 return 0;
548out: 576out:
577 hugepage_exit_sysfs(hugepage_kobj);
549 return err; 578 return err;
550} 579}
551module_init(hugepage_init) 580module_init(hugepage_init)
@@ -997,7 +1026,7 @@ out:
997} 1026}
998 1027
999int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1028int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1000 pmd_t *pmd) 1029 pmd_t *pmd, unsigned long addr)
1001{ 1030{
1002 int ret = 0; 1031 int ret = 0;
1003 1032
@@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1013 pgtable = get_pmd_huge_pte(tlb->mm); 1042 pgtable = get_pmd_huge_pte(tlb->mm);
1014 page = pmd_page(*pmd); 1043 page = pmd_page(*pmd);
1015 pmd_clear(pmd); 1044 pmd_clear(pmd);
1045 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1016 page_remove_rmap(page); 1046 page_remove_rmap(page);
1017 VM_BUG_ON(page_mapcount(page) < 0); 1047 VM_BUG_ON(page_mapcount(page) < 0);
1018 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1048 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1116 entry = pmd_modify(entry, newprot); 1146 entry = pmd_modify(entry, newprot);
1117 set_pmd_at(mm, addr, pmd, entry); 1147 set_pmd_at(mm, addr, pmd, entry);
1118 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1119 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1120 ret = 1; 1149 ret = 1;
1121 } 1150 }
1122 } else 1151 } else
@@ -1199,16 +1228,16 @@ static int __split_huge_page_splitting(struct page *page,
1199static void __split_huge_page_refcount(struct page *page) 1228static void __split_huge_page_refcount(struct page *page)
1200{ 1229{
1201 int i; 1230 int i;
1202 unsigned long head_index = page->index;
1203 struct zone *zone = page_zone(page); 1231 struct zone *zone = page_zone(page);
1204 int zonestat;
1205 int tail_count = 0; 1232 int tail_count = 0;
1206 1233
1207 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1234 /* prevent PageLRU to go away from under us, and freeze lru stats */
1208 spin_lock_irq(&zone->lru_lock); 1235 spin_lock_irq(&zone->lru_lock);
1209 compound_lock(page); 1236 compound_lock(page);
1237 /* complete memcg works before add pages to LRU */
1238 mem_cgroup_split_huge_fixup(page);
1210 1239
1211 for (i = 1; i < HPAGE_PMD_NR; i++) { 1240 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1212 struct page *page_tail = page + i; 1241 struct page *page_tail = page + i;
1213 1242
1214 /* tail_page->_mapcount cannot change */ 1243 /* tail_page->_mapcount cannot change */
@@ -1271,14 +1300,13 @@ static void __split_huge_page_refcount(struct page *page)
1271 BUG_ON(page_tail->mapping); 1300 BUG_ON(page_tail->mapping);
1272 page_tail->mapping = page->mapping; 1301 page_tail->mapping = page->mapping;
1273 1302
1274 page_tail->index = ++head_index; 1303 page_tail->index = page->index + i;
1275 1304
1276 BUG_ON(!PageAnon(page_tail)); 1305 BUG_ON(!PageAnon(page_tail));
1277 BUG_ON(!PageUptodate(page_tail)); 1306 BUG_ON(!PageUptodate(page_tail));
1278 BUG_ON(!PageDirty(page_tail)); 1307 BUG_ON(!PageDirty(page_tail));
1279 BUG_ON(!PageSwapBacked(page_tail)); 1308 BUG_ON(!PageSwapBacked(page_tail));
1280 1309
1281 mem_cgroup_split_huge_fixup(page, page_tail);
1282 1310
1283 lru_add_page_tail(zone, page, page_tail); 1311 lru_add_page_tail(zone, page, page_tail);
1284 } 1312 }
@@ -1288,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page)
1288 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1316 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1289 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1317 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1290 1318
1291 /*
1292 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1293 * so adjust those appropriately if this page is on the LRU.
1294 */
1295 if (PageLRU(page)) {
1296 zonestat = NR_LRU_BASE + page_lru(page);
1297 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1298 }
1299
1300 ClearPageCompound(page); 1319 ClearPageCompound(page);
1301 compound_unlock(page); 1320 compound_unlock(page);
1302 spin_unlock_irq(&zone->lru_lock); 1321 spin_unlock_irq(&zone->lru_lock);
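
hugepage_init_sysfs() above unwinds partial registrations through staged labels (remove_hp_group, delete_obj) so a late failure undoes exactly the steps that already succeeded. The idiom in isolation, as a small runnable sketch:

#include <stdio.h>

static int register_group_a(void) { puts("group A registered"); return 0; }
static int register_group_b(void) { puts("group B failed"); return -1; }
static void unregister_group_a(void) { puts("group A unregistered"); }

/* Staged error unwinding: each later failure undoes, in reverse order,
 * exactly the earlier steps that succeeded. */
static int init_all(void)
{
        int err;

        err = register_group_a();
        if (err)
                return err;

        err = register_group_b();
        if (err)
                goto undo_a;

        return 0;

undo_a:
        unregister_group_a();
        return err;
}

int main(void)
{
        printf("init_all() = %d\n", init_all());
        return 0;
}
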
diff --git a/mm/ksm.c b/mm/ksm.c
index 310544a379ae..1925ffbfb27f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,6 +28,7 @@
28#include <linux/kthread.h> 28#include <linux/kthread.h>
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/memcontrol.h>
31#include <linux/rbtree.h> 32#include <linux/rbtree.h>
32#include <linux/memory.h> 33#include <linux/memory.h>
33#include <linux/mmu_notifier.h> 34#include <linux/mmu_notifier.h>
@@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page,
1571 1572
1572 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1573 if (new_page) { 1574 if (new_page) {
1575 /*
1576 * The memcg-specific accounting when moving
1577 * pages around the LRU lists relies on the
1578 * page's owner (memcg) to be valid. Usually,
1579 * pages are assigned to a new owner before
1580 * being put on the LRU list, but since this
1581 * is not the case here, the stale owner from
1582 * a previous allocation cycle must be reset.
1583 */
1584 mem_cgroup_reset_owner(new_page);
1574 copy_user_highpage(new_page, page, address, vma); 1585 copy_user_highpage(new_page, page, address, vma);
1575 1586
1576 SetPageDirty(new_page); 1587 SetPageDirty(new_page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d87aa3510c5e..602207be9853 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu {
123 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 unsigned long targets[MEM_CGROUP_NTARGETS];
124}; 124};
125 125
126struct mem_cgroup_reclaim_iter {
127 /* css_id of the last scanned hierarchy member */
128 int position;
129 /* scan generation, increased every round-trip */
130 unsigned int generation;
131};
132
126/* 133/*
127 * per-zone information in memory controller. 134 * per-zone information in memory controller.
128 */ 135 */
129struct mem_cgroup_per_zone { 136struct mem_cgroup_per_zone {
130 /* 137 struct lruvec lruvec;
131 * spin_lock to protect the per cgroup LRU
132 */
133 struct list_head lists[NR_LRU_LISTS];
134 unsigned long count[NR_LRU_LISTS]; 138 unsigned long count[NR_LRU_LISTS];
135 139
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141
136 struct zone_reclaim_stat reclaim_stat; 142 struct zone_reclaim_stat reclaim_stat;
137 struct rb_node tree_node; /* RB tree node */ 143 struct rb_node tree_node; /* RB tree node */
138 unsigned long long usage_in_excess;/* Set to the value by which */ 144 unsigned long long usage_in_excess;/* Set to the value by which */
@@ -233,11 +239,6 @@ struct mem_cgroup {
233 * per zone LRU lists. 239 * per zone LRU lists.
234 */ 240 */
235 struct mem_cgroup_lru_info info; 241 struct mem_cgroup_lru_info info;
236 /*
237 * While reclaiming in a hierarchy, we cache the last child we
238 * reclaimed from.
239 */
240 int last_scanned_child;
241 int last_scanned_node; 242 int last_scanned_node;
242#if MAX_NUMNODES > 1 243#if MAX_NUMNODES > 1
243 nodemask_t scan_nodes; 244 nodemask_t scan_nodes;
@@ -366,8 +367,6 @@ enum charge_type {
366#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 367#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
367#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 368#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
368#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 369#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
369#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
370#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
371 370
372static void mem_cgroup_get(struct mem_cgroup *memcg); 371static void mem_cgroup_get(struct mem_cgroup *memcg);
373static void mem_cgroup_put(struct mem_cgroup *memcg); 372static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
566 struct mem_cgroup_per_zone *mz; 565 struct mem_cgroup_per_zone *mz;
567 struct mem_cgroup_tree_per_zone *mctz; 566 struct mem_cgroup_tree_per_zone *mctz;
568 567
569 for_each_node_state(node, N_POSSIBLE) { 568 for_each_node(node) {
570 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 569 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
571 mz = mem_cgroup_zoneinfo(memcg, node, zone); 570 mz = mem_cgroup_zoneinfo(memcg, node, zone);
572 mctz = soft_limit_tree_node_zone(node, zone); 571 mctz = soft_limit_tree_node_zone(node, zone);
@@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
656 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 655 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
657} 656}
658 657
659void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
660{
661 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
662}
663
664void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
665{
666 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
667}
668
669static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 658static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
670 enum mem_cgroup_events_index idx) 659 enum mem_cgroup_events_index idx)
671{ 660{
@@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
749 return total; 738 return total;
750} 739}
751 740
752static bool __memcg_event_check(struct mem_cgroup *memcg, int target) 741static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
742 enum mem_cgroup_events_target target)
753{ 743{
754 unsigned long val, next; 744 unsigned long val, next;
755 745
756 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 746 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
757 next = __this_cpu_read(memcg->stat->targets[target]); 747 next = __this_cpu_read(memcg->stat->targets[target]);
758 /* from time_after() in jiffies.h */ 748 /* from time_after() in jiffies.h */
759 return ((long)next - (long)val < 0); 749 if ((long)next - (long)val < 0) {
760} 750 switch (target) {
761 751 case MEM_CGROUP_TARGET_THRESH:
762static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) 752 next = val + THRESHOLDS_EVENTS_TARGET;
763{ 753 break;
764 unsigned long val, next; 754 case MEM_CGROUP_TARGET_SOFTLIMIT:
765 755 next = val + SOFTLIMIT_EVENTS_TARGET;
766 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 756 break;
767 757 case MEM_CGROUP_TARGET_NUMAINFO:
768 switch (target) { 758 next = val + NUMAINFO_EVENTS_TARGET;
769 case MEM_CGROUP_TARGET_THRESH: 759 break;
770 next = val + THRESHOLDS_EVENTS_TARGET; 760 default:
771 break; 761 break;
772 case MEM_CGROUP_TARGET_SOFTLIMIT: 762 }
773 next = val + SOFTLIMIT_EVENTS_TARGET; 763 __this_cpu_write(memcg->stat->targets[target], next);
774 break; 764 return true;
775 case MEM_CGROUP_TARGET_NUMAINFO:
776 next = val + NUMAINFO_EVENTS_TARGET;
777 break;
778 default:
779 return;
780 } 765 }
781 766 return false;
782 __this_cpu_write(memcg->stat->targets[target], next);
783} 767}
784 768
785/* 769/*
@@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
790{ 774{
791 preempt_disable(); 775 preempt_disable();
792 /* threshold event is triggered in finer grain than soft limit */ 776 /* threshold event is triggered in finer grain than soft limit */
793 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { 777 if (unlikely(mem_cgroup_event_ratelimit(memcg,
778 MEM_CGROUP_TARGET_THRESH))) {
779 bool do_softlimit, do_numainfo;
780
781 do_softlimit = mem_cgroup_event_ratelimit(memcg,
782 MEM_CGROUP_TARGET_SOFTLIMIT);
783#if MAX_NUMNODES > 1
784 do_numainfo = mem_cgroup_event_ratelimit(memcg,
785 MEM_CGROUP_TARGET_NUMAINFO);
786#endif
787 preempt_enable();
788
794 mem_cgroup_threshold(memcg); 789 mem_cgroup_threshold(memcg);
795 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); 790 if (unlikely(do_softlimit))
796 if (unlikely(__memcg_event_check(memcg,
797 MEM_CGROUP_TARGET_SOFTLIMIT))) {
798 mem_cgroup_update_tree(memcg, page); 791 mem_cgroup_update_tree(memcg, page);
799 __mem_cgroup_target_update(memcg,
800 MEM_CGROUP_TARGET_SOFTLIMIT);
801 }
802#if MAX_NUMNODES > 1 792#if MAX_NUMNODES > 1
803 if (unlikely(__memcg_event_check(memcg, 793 if (unlikely(do_numainfo))
804 MEM_CGROUP_TARGET_NUMAINFO))) {
805 atomic_inc(&memcg->numainfo_events); 794 atomic_inc(&memcg->numainfo_events);
806 __mem_cgroup_target_update(memcg,
807 MEM_CGROUP_TARGET_NUMAINFO);
808 }
809#endif 795#endif
810 } 796 } else
811 preempt_enable(); 797 preempt_enable();
812} 798}
813 799
814struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 800struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
853 return memcg; 839 return memcg;
854} 840}
855 841
856/* The caller has to guarantee "mem" exists before calling this */ 842/**
857static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) 843 * mem_cgroup_iter - iterate over memory cgroup hierarchy
844 * @root: hierarchy root
845 * @prev: previously returned memcg, NULL on first invocation
846 * @reclaim: cookie for shared reclaim walks, NULL for full walks
847 *
848 * Returns references to children of the hierarchy below @root, or
849 * @root itself, or %NULL after a full round-trip.
850 *
851 * Caller must pass the return value in @prev on subsequent
852 * invocations for reference counting, or use mem_cgroup_iter_break()
853 * to cancel a hierarchy walk before the round-trip is complete.
854 *
855 * Reclaimers can specify a zone and a priority level in @reclaim to
856 * divide up the memcgs in the hierarchy among all concurrent
857 * reclaimers operating on the same zone and priority.
858 */
859struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
860 struct mem_cgroup *prev,
861 struct mem_cgroup_reclaim_cookie *reclaim)
858{ 862{
859 struct cgroup_subsys_state *css; 863 struct mem_cgroup *memcg = NULL;
860 int found; 864 int id = 0;
861 865
862 if (!memcg) /* ROOT cgroup has the smallest ID */ 866 if (mem_cgroup_disabled())
863 return root_mem_cgroup; /*css_put/get against root is ignored*/
864 if (!memcg->use_hierarchy) {
865 if (css_tryget(&memcg->css))
866 return memcg;
867 return NULL; 867 return NULL;
868 }
869 rcu_read_lock();
870 /*
871 * searching a memory cgroup which has the smallest ID under given
872 * ROOT cgroup. (ID >= 1)
873 */
874 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
875 if (css && css_tryget(css))
876 memcg = container_of(css, struct mem_cgroup, css);
877 else
878 memcg = NULL;
879 rcu_read_unlock();
880 return memcg;
881}
882 868
883static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 869 if (!root)
884 struct mem_cgroup *root, 870 root = root_mem_cgroup;
885 bool cond)
886{
887 int nextid = css_id(&iter->css) + 1;
888 int found;
889 int hierarchy_used;
890 struct cgroup_subsys_state *css;
891 871
892 hierarchy_used = iter->use_hierarchy; 872 if (prev && !reclaim)
873 id = css_id(&prev->css);
893 874
894 css_put(&iter->css); 875 if (prev && prev != root)
895 /* If no ROOT, walk all, ignore hierarchy */ 876 css_put(&prev->css);
896 if (!cond || (root && !hierarchy_used))
897 return NULL;
898 877
899 if (!root) 878 if (!root->use_hierarchy && root != root_mem_cgroup) {
900 root = root_mem_cgroup; 879 if (prev)
880 return NULL;
881 return root;
882 }
901 883
902 do { 884 while (!memcg) {
903 iter = NULL; 885 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
904 rcu_read_lock(); 886 struct cgroup_subsys_state *css;
887
888 if (reclaim) {
889 int nid = zone_to_nid(reclaim->zone);
890 int zid = zone_idx(reclaim->zone);
891 struct mem_cgroup_per_zone *mz;
905 892
906 css = css_get_next(&mem_cgroup_subsys, nextid, 893 mz = mem_cgroup_zoneinfo(root, nid, zid);
907 &root->css, &found); 894 iter = &mz->reclaim_iter[reclaim->priority];
908 if (css && css_tryget(css)) 895 if (prev && reclaim->generation != iter->generation)
909 iter = container_of(css, struct mem_cgroup, css); 896 return NULL;
897 id = iter->position;
898 }
899
900 rcu_read_lock();
901 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
902 if (css) {
903 if (css == &root->css || css_tryget(css))
904 memcg = container_of(css,
905 struct mem_cgroup, css);
906 } else
907 id = 0;
910 rcu_read_unlock(); 908 rcu_read_unlock();
911 /* If css is NULL, no more cgroups will be found */
912 nextid = found + 1;
913 } while (css && !iter);
914 909
915 return iter; 910 if (reclaim) {
911 iter->position = id;
912 if (!css)
913 iter->generation++;
914 else if (!prev && memcg)
915 reclaim->generation = iter->generation;
916 }
917
918 if (prev && !css)
919 return NULL;
920 }
921 return memcg;
916} 922}
917/*
918 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
919 * be careful that "break" loop is not allowed. We have reference count.
920 * Instead of that modify "cond" to be false and "continue" to exit the loop.
921 */
922#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
923 for (iter = mem_cgroup_start_loop(root);\
924 iter != NULL;\
925 iter = mem_cgroup_get_next(iter, root, cond))
926 923
927#define for_each_mem_cgroup_tree(iter, root) \ 924/**
928 for_each_mem_cgroup_tree_cond(iter, root, true) 925 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
926 * @root: hierarchy root
927 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
928 */
929void mem_cgroup_iter_break(struct mem_cgroup *root,
930 struct mem_cgroup *prev)
931{
932 if (!root)
933 root = root_mem_cgroup;
934 if (prev && prev != root)
935 css_put(&prev->css);
936}
929 937
930#define for_each_mem_cgroup_all(iter) \ 938/*
931 for_each_mem_cgroup_tree_cond(iter, NULL, true) 939 * Iteration constructs for visiting all cgroups (under a tree). If
940 * loops are exited prematurely (break), mem_cgroup_iter_break() must
941 * be used for reference counting.
942 */
943#define for_each_mem_cgroup_tree(iter, root) \
944 for (iter = mem_cgroup_iter(root, NULL, NULL); \
945 iter != NULL; \
946 iter = mem_cgroup_iter(root, iter, NULL))
932 947
948#define for_each_mem_cgroup(iter) \
949 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
950 iter != NULL; \
951 iter = mem_cgroup_iter(NULL, iter, NULL))
933 952
934static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 953static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
935{ 954{
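
The mem_cgroup_iter() kerneldoc above defines an iterator protocol: start with prev == NULL, feed each returned memcg back in as prev, and call mem_cgroup_iter_break() if the walk stops before the round-trip completes. A userspace caricature of that calling convention over a flat array; the real iterator additionally handles css reference counting and the per-zone, per-priority reclaim cookies:

#include <stddef.h>
#include <stdio.h>

struct memcg { const char *name; };

static struct memcg hierarchy[] = {
        { "root" }, { "child-a" }, { "child-b" },
};
#define N_MEMCGS (sizeof(hierarchy) / sizeof(hierarchy[0]))

/* NULL prev starts a walk; a NULL return ends the round-trip */
static struct memcg *memcg_iter(struct memcg *prev)
{
        size_t next = prev ? (size_t)(prev - hierarchy) + 1 : 0;

        return next < N_MEMCGS ? &hierarchy[next] : NULL;
}

/* in the kernel this drops the css reference still held on prev */
static void memcg_iter_break(struct memcg *prev)
{
        (void)prev;
}

int main(void)
{
        struct memcg *iter;

        for (iter = memcg_iter(NULL); iter; iter = memcg_iter(iter)) {
                printf("visiting %s\n", iter->name);
                if (iter == &hierarchy[1]) {
                        memcg_iter_break(iter); /* abandoning the walk early */
                        break;
                }
        }
        return 0;
}
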
@@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
949 goto out; 968 goto out;
950 969
951 switch (idx) { 970 switch (idx) {
952 case PGMAJFAULT:
953 mem_cgroup_pgmajfault(memcg, 1);
954 break;
955 case PGFAULT: 971 case PGFAULT:
956 mem_cgroup_pgfault(memcg, 1); 972 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
973 break;
974 case PGMAJFAULT:
975 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
957 break; 976 break;
958 default: 977 default:
959 BUG(); 978 BUG();
@@ -963,6 +982,27 @@ out:
963} 982}
964EXPORT_SYMBOL(mem_cgroup_count_vm_event); 983EXPORT_SYMBOL(mem_cgroup_count_vm_event);
965 984
985/**
986 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
987 * @zone: zone of the wanted lruvec
988 * @mem: memcg of the wanted lruvec
989 *
990 * Returns the lru list vector holding pages for the given @zone and
991 * @mem. This can be the global zone lruvec, if the memory controller
992 * is disabled.
993 */
994struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
995 struct mem_cgroup *memcg)
996{
997 struct mem_cgroup_per_zone *mz;
998
999 if (mem_cgroup_disabled())
1000 return &zone->lruvec;
1001
1002 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1003 return &mz->lruvec;
1004}
1005
966/* 1006/*
967 * Following LRU functions are allowed to be used without PCG_LOCK. 1007 * Following LRU functions are allowed to be used without PCG_LOCK.
968 * Operations are called by routine of global LRU independently from memcg. 1008 * Operations are called by routine of global LRU independently from memcg.
@@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
977 * When moving account, the page is not on LRU. It's isolated. 1017 * When moving account, the page is not on LRU. It's isolated.
978 */ 1018 */
979 1019
980void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 1020/**
981{ 1021 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
982 struct page_cgroup *pc; 1022 * @zone: zone of the page
983 struct mem_cgroup_per_zone *mz; 1023 * @page: the page
984 1024 * @lru: current lru
985 if (mem_cgroup_disabled()) 1025 *
986 return; 1026 * This function accounts for @page being added to @lru, and returns
987 pc = lookup_page_cgroup(page); 1027 * the lruvec for the given @zone and the memcg @page is charged to.
988 /* can happen while we handle swapcache. */ 1028 *
989 if (!TestClearPageCgroupAcctLRU(pc)) 1029 * The callsite is then responsible for physically linking the page to
990 return; 1030 * the returned lruvec->lists[@lru].
991 VM_BUG_ON(!pc->mem_cgroup);
992 /*
993 * We don't check PCG_USED bit. It's cleared when the "page" is finally
994 * removed from global LRU.
995 */
996 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
997 /* huge page split is done under lru_lock. so, we have no races. */
998 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
999 if (mem_cgroup_is_root(pc->mem_cgroup))
1000 return;
1001 VM_BUG_ON(list_empty(&pc->lru));
1002 list_del_init(&pc->lru);
1003}
1004
1005void mem_cgroup_del_lru(struct page *page)
1006{
1007 mem_cgroup_del_lru_list(page, page_lru(page));
1008}
1009
1010/*
1011 * Writeback is about to end against a page which has been marked for immediate
1012 * reclaim. If it still appears to be reclaimable, move it to the tail of the
1013 * inactive list.
1014 */ 1031 */
1015void mem_cgroup_rotate_reclaimable_page(struct page *page) 1032struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1033 enum lru_list lru)
1016{ 1034{
1017 struct mem_cgroup_per_zone *mz; 1035 struct mem_cgroup_per_zone *mz;
1036 struct mem_cgroup *memcg;
1018 struct page_cgroup *pc; 1037 struct page_cgroup *pc;
1019 enum lru_list lru = page_lru(page);
1020 1038
1021 if (mem_cgroup_disabled()) 1039 if (mem_cgroup_disabled())
1022 return; 1040 return &zone->lruvec;
1023 1041
1024 pc = lookup_page_cgroup(page); 1042 pc = lookup_page_cgroup(page);
1025 /* unused or root page is not rotated. */ 1043 memcg = pc->mem_cgroup;
1026 if (!PageCgroupUsed(pc)) 1044 mz = page_cgroup_zoneinfo(memcg, page);
1027 return; 1045 /* compound_order() is stabilized through lru_lock */
1028 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1029 smp_rmb(); 1047 return &mz->lruvec;
1030 if (mem_cgroup_is_root(pc->mem_cgroup))
1031 return;
1032 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1033 list_move_tail(&pc->lru, &mz->lists[lru]);
1034} 1048}
1035 1049
1036void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1050/**
1051 * mem_cgroup_lru_del_list - account for removing an lru page
1052 * @page: the page
1053 * @lru: target lru
1054 *
1055 * This function accounts for @page being removed from @lru.
1056 *
1057 * The callsite is then responsible for physically unlinking
1058 * @page->lru.
1059 */
1060void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1037{ 1061{
1038 struct mem_cgroup_per_zone *mz; 1062 struct mem_cgroup_per_zone *mz;
1063 struct mem_cgroup *memcg;
1039 struct page_cgroup *pc; 1064 struct page_cgroup *pc;
1040 1065
1041 if (mem_cgroup_disabled()) 1066 if (mem_cgroup_disabled())
1042 return; 1067 return;
1043 1068
1044 pc = lookup_page_cgroup(page); 1069 pc = lookup_page_cgroup(page);
1045 /* unused or root page is not rotated. */ 1070 memcg = pc->mem_cgroup;
1046 if (!PageCgroupUsed(pc)) 1071 VM_BUG_ON(!memcg);
1047 return; 1072 mz = page_cgroup_zoneinfo(memcg, page);
1048 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1049 smp_rmb();
1050 if (mem_cgroup_is_root(pc->mem_cgroup))
1051 return;
1052 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1053 list_move(&pc->lru, &mz->lists[lru]);
1054}
1055
1056void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1057{
1058 struct page_cgroup *pc;
1059 struct mem_cgroup_per_zone *mz;
1060
1061 if (mem_cgroup_disabled())
1062 return;
1063 pc = lookup_page_cgroup(page);
1064 VM_BUG_ON(PageCgroupAcctLRU(pc));
1065 /*
1066 * putback: charge:
1067 * SetPageLRU SetPageCgroupUsed
1068 * smp_mb smp_mb
1069 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1070 *
1071 * Ensure that one of the two sides adds the page to the memcg
1072 * LRU during a race.
1073 */
1074 smp_mb();
1075 if (!PageCgroupUsed(pc))
1076 return;
1077 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1078 smp_rmb();
1079 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1080 /* huge page split is done under lru_lock. so, we have no races. */ 1073 /* huge page split is done under lru_lock. so, we have no races. */
1081 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1074 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
1082 SetPageCgroupAcctLRU(pc); 1075 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1083 if (mem_cgroup_is_root(pc->mem_cgroup))
1084 return;
1085 list_add(&pc->lru, &mz->lists[lru]);
1086}
1087
1088/*
1089 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1090 * while it's linked to lru because the page may be reused after it's fully
1091 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
1092 * It's done under lock_page and expected that zone->lru_lock isnever held.
1093 */
1094static void mem_cgroup_lru_del_before_commit(struct page *page)
1095{
1096 unsigned long flags;
1097 struct zone *zone = page_zone(page);
1098 struct page_cgroup *pc = lookup_page_cgroup(page);
1099
1100 /*
1101 * Doing this check without taking ->lru_lock seems wrong but this
1102 * is safe. Because if page_cgroup's USED bit is unset, the page
1103 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1104 * set, the commit after this will fail, anyway.
1105 * This all charge/uncharge is done under some mutual execustion.
1106 * So, we don't need to taking care of changes in USED bit.
1107 */
1108 if (likely(!PageLRU(page)))
1109 return;
1110
1111 spin_lock_irqsave(&zone->lru_lock, flags);
1112 /*
1113 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1114 * is guarded by lock_page() because the page is SwapCache.
1115 */
1116 if (!PageCgroupUsed(pc))
1117 mem_cgroup_del_lru_list(page, page_lru(page));
1118 spin_unlock_irqrestore(&zone->lru_lock, flags);
1119} 1076}
1120 1077
1121static void mem_cgroup_lru_add_after_commit(struct page *page) 1078void mem_cgroup_lru_del(struct page *page)
1122{ 1079{
1123 unsigned long flags; 1080 mem_cgroup_lru_del_list(page, page_lru(page));
1124 struct zone *zone = page_zone(page);
1125 struct page_cgroup *pc = lookup_page_cgroup(page);
1126 /*
1127 * putback: charge:
1128 * SetPageLRU SetPageCgroupUsed
1129 * smp_mb smp_mb
1130 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1131 *
1132 * Ensure that one of the two sides adds the page to the memcg
1133 * LRU during a race.
1134 */
1135 smp_mb();
1136 /* taking care of that the page is added to LRU while we commit it */
1137 if (likely(!PageLRU(page)))
1138 return;
1139 spin_lock_irqsave(&zone->lru_lock, flags);
1140 /* link when the page is linked to LRU but page_cgroup isn't */
1141 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1142 mem_cgroup_add_lru_list(page, page_lru(page));
1143 spin_unlock_irqrestore(&zone->lru_lock, flags);
1144} 1081}
1145 1082
1146 1083/**
1147void mem_cgroup_move_lists(struct page *page, 1084 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1148 enum lru_list from, enum lru_list to) 1085 * @zone: zone of the page
1086 * @page: the page
1087 * @from: current lru
1088 * @to: target lru
1089 *
1090 * This function accounts for @page being moved between the lrus @from
1091 * and @to, and returns the lruvec for the given @zone and the memcg
1092 * @page is charged to.
1093 *
1094 * The callsite is then responsible for physically relinking
1095 * @page->lru to the returned lruvec->lists[@to].
1096 */
1097struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1098 struct page *page,
1099 enum lru_list from,
1100 enum lru_list to)
1149{ 1101{
1150 if (mem_cgroup_disabled()) 1102 /* XXX: Optimize this, especially for @from == @to */
1151 return; 1103 mem_cgroup_lru_del_list(page, from);
1152 mem_cgroup_del_lru_list(page, from); 1104 return mem_cgroup_lru_add_list(zone, page, to);
1153 mem_cgroup_add_lru_list(page, to);
1154} 1105}
1155 1106
1156/* 1107/*
@@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1175 struct task_struct *p; 1126 struct task_struct *p;
1176 1127
1177 p = find_lock_task_mm(task); 1128 p = find_lock_task_mm(task);
1178 if (!p) 1129 if (p) {
1179 return 0; 1130 curr = try_get_mem_cgroup_from_mm(p->mm);
1180 curr = try_get_mem_cgroup_from_mm(p->mm); 1131 task_unlock(p);
1181 task_unlock(p); 1132 } else {
1133 /*
1134 * All threads may have already detached their mm's, but the oom
1135 * killer still needs to detect if they have already been oom
1136 * killed to prevent needlessly killing additional tasks.
1137 */
1138 task_lock(task);
1139 curr = mem_cgroup_from_task(task);
1140 if (curr)
1141 css_get(&curr->css);
1142 task_unlock(task);
1143 }
1182 if (!curr) 1144 if (!curr)
1183 return 0; 1145 return 0;
1184 /* 1146 /*
@@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1258 return &mz->reclaim_stat; 1220 return &mz->reclaim_stat;
1259} 1221}
1260 1222
1261unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1262 struct list_head *dst,
1263 unsigned long *scanned, int order,
1264 isolate_mode_t mode,
1265 struct zone *z,
1266 struct mem_cgroup *mem_cont,
1267 int active, int file)
1268{
1269 unsigned long nr_taken = 0;
1270 struct page *page;
1271 unsigned long scan;
1272 LIST_HEAD(pc_list);
1273 struct list_head *src;
1274 struct page_cgroup *pc, *tmp;
1275 int nid = zone_to_nid(z);
1276 int zid = zone_idx(z);
1277 struct mem_cgroup_per_zone *mz;
1278 int lru = LRU_FILE * file + active;
1279 int ret;
1280
1281 BUG_ON(!mem_cont);
1282 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1283 src = &mz->lists[lru];
1284
1285 scan = 0;
1286 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1287 if (scan >= nr_to_scan)
1288 break;
1289
1290 if (unlikely(!PageCgroupUsed(pc)))
1291 continue;
1292
1293 page = lookup_cgroup_page(pc);
1294
1295 if (unlikely(!PageLRU(page)))
1296 continue;
1297
1298 scan++;
1299 ret = __isolate_lru_page(page, mode, file);
1300 switch (ret) {
1301 case 0:
1302 list_move(&page->lru, dst);
1303 mem_cgroup_del_lru(page);
1304 nr_taken += hpage_nr_pages(page);
1305 break;
1306 case -EBUSY:
1307 /* we don't affect global LRU but rotate in our LRU */
1308 mem_cgroup_rotate_lru_list(page, page_lru(page));
1309 break;
1310 default:
1311 break;
1312 }
1313 }
1314
1315 *scanned = scan;
1316
1317 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1318 0, 0, 0, mode);
1319
1320 return nr_taken;
1321}
1322
1323#define mem_cgroup_from_res_counter(counter, member) \ 1223#define mem_cgroup_from_res_counter(counter, member) \
1324 container_of(counter, struct mem_cgroup, member) 1224 container_of(counter, struct mem_cgroup, member)
1325 1225
@@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1536 return min(limit, memsw); 1436 return min(limit, memsw);
1537} 1437}
1538 1438
1539/* 1439static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 * Visit the first child (need not be the first child as per the ordering 1440 gfp_t gfp_mask,
1541 * of the cgroup list, since we track last_scanned_child) of @mem and use 1441 unsigned long flags)
1542 * that to reclaim free pages from.
1543 */
1544static struct mem_cgroup *
1545mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1546{ 1442{
1547 struct mem_cgroup *ret = NULL; 1443 unsigned long total = 0;
1548 struct cgroup_subsys_state *css; 1444 bool noswap = false;
1549 int nextid, found; 1445 int loop;
1550
1551 if (!root_memcg->use_hierarchy) {
1552 css_get(&root_memcg->css);
1553 ret = root_memcg;
1554 }
1555 1446
1556 while (!ret) { 1447 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1557 rcu_read_lock(); 1448 noswap = true;
1558 nextid = root_memcg->last_scanned_child + 1; 1449 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1559 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, 1450 noswap = true;
1560 &found);
1561 if (css && css_tryget(css))
1562 ret = container_of(css, struct mem_cgroup, css);
1563 1451
1564 rcu_read_unlock(); 1452 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1565 /* Updates scanning parameter */ 1453 if (loop)
1566 if (!css) { 1454 drain_all_stock_async(memcg);
1567 /* this means start scan from ID:1 */ 1455 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1568 root_memcg->last_scanned_child = 0; 1456 /*
1569 } else 1457 * Allow limit shrinkers, which are triggered directly
1570 root_memcg->last_scanned_child = found; 1458 * by userspace, to catch signals and stop reclaim
1459 * after minimal progress, regardless of the margin.
1460 */
1461 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1462 break;
1463 if (mem_cgroup_margin(memcg))
1464 break;
1465 /*
1466 * If nothing was reclaimed after two attempts, there
1467 * may be no reclaimable pages in this hierarchy.
1468 */
1469 if (loop && !total)
1470 break;
1571 } 1471 }
1572 1472 return total;
1573 return ret;
1574} 1473}
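
The new mem_cgroup_reclaim() above is a bounded retry loop: keep reclaiming until enough margin appears, a userspace-triggered shrink has made minimal progress, no progress at all was made after the first pass, or MEM_CGROUP_MAX_RECLAIM_LOOPS is hit. The stand-alone C sketch below models just that control flow with stubbed-out reclaim and margin checks; all names here are illustrative, not kernel API.

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_RECLAIM_LOOPS 100
    #define FLAG_SHRINK       0x1

    /* stub: each call frees up to 8 "pages" until nothing is left */
    static unsigned long reclaim_step(unsigned long *reclaimable)
    {
        unsigned long freed = *reclaimable > 8 ? 8 : *reclaimable;

        *reclaimable -= freed;
        return freed;
    }

    /* stub: enough margin once 32 "pages" have been freed */
    static bool margin_ok(unsigned long freed_total)
    {
        return freed_total >= 32;
    }

    static unsigned long bounded_reclaim(unsigned long *reclaimable, unsigned long flags)
    {
        unsigned long total = 0;
        int loop;

        for (loop = 0; loop < MAX_RECLAIM_LOOPS; loop++) {
            total += reclaim_step(reclaimable);
            /* limit shrinkers stop after any progress at all */
            if (total && (flags & FLAG_SHRINK))
                break;
            if (margin_ok(total))
                break;
            /* no progress after the first pass: nothing left to reclaim */
            if (loop && !total)
                break;
        }
        return total;
    }

    int main(void)
    {
        unsigned long pool = 100;

        printf("freed %lu\n", bounded_reclaim(&pool, 0));
        pool = 100;
        printf("freed (shrink) %lu\n", bounded_reclaim(&pool, FLAG_SHRINK));
        return 0;
    }
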
1575 1474
1576/** 1475/**
@@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1710} 1609}
1711#endif 1610#endif
1712 1611
1713/* 1612static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1714 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1613 struct zone *zone,
1715 * we reclaimed from, so that we don't end up penalizing one child extensively 1614 gfp_t gfp_mask,
1716 * based on its position in the children list. 1615 unsigned long *total_scanned)
1717 * 1616{
1718 * root_memcg is the original ancestor that we've been reclaim from. 1617 struct mem_cgroup *victim = NULL;
1719 * 1618 int total = 0;
1720 * We give up and return to the caller when we visit root_memcg twice.
1721 * (other groups can be removed while we're walking....)
1722 *
1723 * If shrink==true, this returns immediately so as to avoid freeing too much.
1724 */
1725static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1726 struct zone *zone,
1727 gfp_t gfp_mask,
1728 unsigned long reclaim_options,
1729 unsigned long *total_scanned)
1730{
1731 struct mem_cgroup *victim;
1732 int ret, total = 0;
1733 int loop = 0; 1619 int loop = 0;
1734 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1735 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1736 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1737 unsigned long excess; 1620 unsigned long excess;
1738 unsigned long nr_scanned; 1621 unsigned long nr_scanned;
1622 struct mem_cgroup_reclaim_cookie reclaim = {
1623 .zone = zone,
1624 .priority = 0,
1625 };
1739 1626
1740 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1627 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1741 1628
1742 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1743 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1744 noswap = true;
1745
1746 while (1) { 1629 while (1) {
1747 victim = mem_cgroup_select_victim(root_memcg); 1630 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1748 if (victim == root_memcg) { 1631 if (!victim) {
1749 loop++; 1632 loop++;
1750 /*
1751 * We are not draining per cpu cached charges during
1752 * soft limit reclaim because global reclaim doesn't
1753 * care about charges. It tries to free some memory and
1754 * charges will not give any.
1755 */
1756 if (!check_soft && loop >= 1)
1757 drain_all_stock_async(root_memcg);
1758 if (loop >= 2) { 1633 if (loop >= 2) {
1759 /* 1634 /*
1760 * If we have not been able to reclaim 1635 * If we have not been able to reclaim
1761 * anything, it might be because there are 1636 * anything, it might be because there are
1762 * no reclaimable pages under this hierarchy 1637 * no reclaimable pages under this hierarchy
1763 */ 1638 */
1764 if (!check_soft || !total) { 1639 if (!total)
1765 css_put(&victim->css);
1766 break; 1640 break;
1767 }
1768 /* 1641 /*
1769 * We want to do more targeted reclaim. 1642 * We want to do more targeted reclaim.
1770 * excess >> 2 is not too excessive so as to 1643 * excess >> 2 is not too excessive so as to
@@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1772 * coming back to reclaim from this cgroup 1645 * coming back to reclaim from this cgroup
1773 */ 1646 */
1774 if (total >= (excess >> 2) || 1647 if (total >= (excess >> 2) ||
1775 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1648 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1776 css_put(&victim->css);
1777 break; 1649 break;
1778 }
1779 } 1650 }
1780 }
1781 if (!mem_cgroup_reclaimable(victim, noswap)) {
1782 /* this cgroup's local usage == 0 */
1783 css_put(&victim->css);
1784 continue; 1651 continue;
1785 } 1652 }
1786 /* we use swappiness of local cgroup */ 1653 if (!mem_cgroup_reclaimable(victim, false))
1787 if (check_soft) { 1654 continue;
1788 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1655 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1789 noswap, zone, &nr_scanned); 1656 zone, &nr_scanned);
1790 *total_scanned += nr_scanned; 1657 *total_scanned += nr_scanned;
1791 } else 1658 if (!res_counter_soft_limit_excess(&root_memcg->res))
1792 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1659 break;
1793 noswap);
1794 css_put(&victim->css);
1795 /*
1796 * When shrinking usage, we can't check whether we should stop here or
1797 * reclaim more. It depends on the callers. last_scanned_child
1798 * will work well enough for keeping fairness under the tree.
1799 */
1800 if (shrink)
1801 return ret;
1802 total += ret;
1803 if (check_soft) {
1804 if (!res_counter_soft_limit_excess(&root_memcg->res))
1805 return total;
1806 } else if (mem_cgroup_margin(root_memcg))
1807 return total;
1808 } 1660 }
1661 mem_cgroup_iter_break(root_memcg, victim);
1809 return total; 1662 return total;
1810} 1663}
1811 1664
@@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1817static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1670static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1818{ 1671{
1819 struct mem_cgroup *iter, *failed = NULL; 1672 struct mem_cgroup *iter, *failed = NULL;
1820 bool cond = true;
1821 1673
1822 for_each_mem_cgroup_tree_cond(iter, memcg, cond) { 1674 for_each_mem_cgroup_tree(iter, memcg) {
1823 if (iter->oom_lock) { 1675 if (iter->oom_lock) {
1824 /* 1676 /*
1825 * this subtree of our hierarchy is already locked 1677 * this subtree of our hierarchy is already locked
1826 * so we cannot give a lock. 1678 * so we cannot give a lock.
1827 */ 1679 */
1828 failed = iter; 1680 failed = iter;
1829 cond = false; 1681 mem_cgroup_iter_break(memcg, iter);
1682 break;
1830 } else 1683 } else
1831 iter->oom_lock = true; 1684 iter->oom_lock = true;
1832 } 1685 }
@@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1838 * OK, we failed to lock the whole subtree so we have to clean up 1691 * OK, we failed to lock the whole subtree so we have to clean up
1839 * what we set up to the failing subtree 1692 * what we set up to the failing subtree
1840 */ 1693 */
1841 cond = true; 1694 for_each_mem_cgroup_tree(iter, memcg) {
1842 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1843 if (iter == failed) { 1695 if (iter == failed) {
1844 cond = false; 1696 mem_cgroup_iter_break(memcg, iter);
1845 continue; 1697 break;
1846 } 1698 }
1847 iter->oom_lock = false; 1699 iter->oom_lock = false;
1848 } 1700 }
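
The oom_lock change above replaces the cond-based walk with mem_cgroup_iter_break(): try to mark every member of the hierarchy and, if one is already locked, walk it again and clear only the members marked before the failure point. In the self-contained sketch below, a flat array stands in for the cgroup tree; the sizes and names are made up for illustration.

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_GROUPS 5

    static bool oom_lock[NR_GROUPS];

    static bool lock_hierarchy(void)
    {
        int i, failed = -1;

        for (i = 0; i < NR_GROUPS; i++) {
            if (oom_lock[i]) {          /* already locked by someone else */
                failed = i;
                break;
            }
            oom_lock[i] = true;
        }
        if (failed < 0)
            return true;

        /* undo what we set, stopping at the member that made us fail */
        for (i = 0; i < NR_GROUPS; i++) {
            if (i == failed)
                break;
            oom_lock[i] = false;
        }
        return false;
    }

    int main(void)
    {
        oom_lock[3] = true;             /* pretend another task holds member 3 */
        printf("locked whole tree: %s\n", lock_hierarchy() ? "yes" : "no");
        printf("member 0 still locked by us: %d\n", oom_lock[0]);
        return 0;
    }
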
@@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page,
2007 bool need_unlock = false; 1859 bool need_unlock = false;
2008 unsigned long uninitialized_var(flags); 1860 unsigned long uninitialized_var(flags);
2009 1861
2010 if (unlikely(!pc)) 1862 if (mem_cgroup_disabled())
2011 return; 1863 return;
2012 1864
2013 rcu_read_lock(); 1865 rcu_read_lock();
@@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2238 struct mem_cgroup *iter; 2090 struct mem_cgroup *iter;
2239 2091
2240 if ((action == CPU_ONLINE)) { 2092 if ((action == CPU_ONLINE)) {
2241 for_each_mem_cgroup_all(iter) 2093 for_each_mem_cgroup(iter)
2242 synchronize_mem_cgroup_on_move(iter, cpu); 2094 synchronize_mem_cgroup_on_move(iter, cpu);
2243 return NOTIFY_OK; 2095 return NOTIFY_OK;
2244 } 2096 }
@@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2246 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2098 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2247 return NOTIFY_OK; 2099 return NOTIFY_OK;
2248 2100
2249 for_each_mem_cgroup_all(iter) 2101 for_each_mem_cgroup(iter)
2250 mem_cgroup_drain_pcp_counter(iter, cpu); 2102 mem_cgroup_drain_pcp_counter(iter, cpu);
2251 2103
2252 stock = &per_cpu(memcg_stock, cpu); 2104 stock = &per_cpu(memcg_stock, cpu);
@@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2300 if (!(gfp_mask & __GFP_WAIT)) 2152 if (!(gfp_mask & __GFP_WAIT))
2301 return CHARGE_WOULDBLOCK; 2153 return CHARGE_WOULDBLOCK;
2302 2154
2303 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2155 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2304 gfp_mask, flags, NULL);
2305 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2156 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2306 return CHARGE_RETRY; 2157 return CHARGE_RETRY;
2307 /* 2158 /*
@@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2334} 2185}
2335 2186
2336/* 2187/*
2337 * Unlike exported interface, "oom" parameter is added. if oom==true, 2188 * __mem_cgroup_try_charge() does
2338 * oom-killer can be invoked. 2189 * 1. detect memcg to be charged against from passed *mm and *ptr,
2190 * 2. update res_counter
2191 * 3. call memory reclaim if necessary.
2192 *
2193 * In some special cases, if the task is dying (fatal_signal_pending() is
2194 * true or TIF_MEMDIE is set), this function returns -EINTR while writing
2195 * root_mem_cgroup to *ptr. There are two reasons for this. 1: dying threads
2196 * should quit as soon as possible without any hazards. 2: all pages should have a valid
2197 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2198 * pointer, that is treated as a charge to root_mem_cgroup.
2199 *
2200 * So __mem_cgroup_try_charge() will return
2201 * 0 ... on success, filling *ptr with a valid memcg pointer.
2202 * -ENOMEM ... charge failure because of resource limits.
2203 * -EINTR ... if the thread is dying. *ptr is filled with root_mem_cgroup.
2204 *
2205 * Unlike the exported interface, an "oom" parameter is added. if oom==true,
2206 * the oom-killer can be invoked.
2339 */ 2207 */
2340static int __mem_cgroup_try_charge(struct mm_struct *mm, 2208static int __mem_cgroup_try_charge(struct mm_struct *mm,
2341 gfp_t gfp_mask, 2209 gfp_t gfp_mask,
@@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2364 * set, if so charge the init_mm (happens for pagecache usage). 2232 * set, if so charge the init_mm (happens for pagecache usage).
2365 */ 2233 */
2366 if (!*ptr && !mm) 2234 if (!*ptr && !mm)
2367 goto bypass; 2235 *ptr = root_mem_cgroup;
2368again: 2236again:
2369 if (*ptr) { /* css should be a valid one */ 2237 if (*ptr) { /* css should be a valid one */
2370 memcg = *ptr; 2238 memcg = *ptr;
@@ -2390,7 +2258,9 @@ again:
2390 * task-struct. So, mm->owner can be NULL. 2258 * task-struct. So, mm->owner can be NULL.
2391 */ 2259 */
2392 memcg = mem_cgroup_from_task(p); 2260 memcg = mem_cgroup_from_task(p);
2393 if (!memcg || mem_cgroup_is_root(memcg)) { 2261 if (!memcg)
2262 memcg = root_mem_cgroup;
2263 if (mem_cgroup_is_root(memcg)) {
2394 rcu_read_unlock(); 2264 rcu_read_unlock();
2395 goto done; 2265 goto done;
2396 } 2266 }
@@ -2465,8 +2335,8 @@ nomem:
2465 *ptr = NULL; 2335 *ptr = NULL;
2466 return -ENOMEM; 2336 return -ENOMEM;
2467bypass: 2337bypass:
2468 *ptr = NULL; 2338 *ptr = root_mem_cgroup;
2469 return 0; 2339 return -EINTR;
2470} 2340}
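
The reworked __mem_cgroup_try_charge() contract documented above has three outcomes: 0 with a valid memcg in *ptr, -ENOMEM on a genuine charge failure, and -EINTR for a dying task, where the charge is bypassed to root_mem_cgroup so the page still ends up with a valid owner. The following user-space model shows how a caller can treat those cases; struct memcg, try_charge() and the helpers here are stand-ins, not the kernel interfaces.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct memcg { const char *name; };

    static struct memcg root_group = { "root" };
    static struct memcg some_group = { "A" };

    static int try_charge(bool dying, bool over_limit, struct memcg **out)
    {
        if (dying) {
            *out = &root_group;   /* bypass: the page still gets a valid owner */
            return -EINTR;
        }
        if (over_limit)
            return -ENOMEM;       /* the only outcome that leaves *out untouched */
        *out = &some_group;
        return 0;
    }

    int main(void)
    {
        struct memcg *owner = NULL;
        int ret = try_charge(false, false, &owner);

        if (ret == -ENOMEM)
            return 1;             /* only the hard failure aborts the caller */
        /* 0 and -EINTR both leave a usable owner behind */
        printf("charged to %s (ret=%d)\n", owner->name, ret);
        return 0;
    }
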
2471 2341
2472/* 2342/*
@@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2522 memcg = NULL; 2392 memcg = NULL;
2523 } else if (PageSwapCache(page)) { 2393 } else if (PageSwapCache(page)) {
2524 ent.val = page_private(page); 2394 ent.val = page_private(page);
2525 id = lookup_swap_cgroup(ent); 2395 id = lookup_swap_cgroup_id(ent);
2526 rcu_read_lock(); 2396 rcu_read_lock();
2527 memcg = mem_cgroup_lookup(id); 2397 memcg = mem_cgroup_lookup(id);
2528 if (memcg && !css_tryget(&memcg->css)) 2398 if (memcg && !css_tryget(&memcg->css))
@@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2574 2444
2575 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2445 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2576 unlock_page_cgroup(pc); 2446 unlock_page_cgroup(pc);
2447 WARN_ON_ONCE(PageLRU(page));
2577 /* 2448 /*
2578 * "charge_statistics" updated event counter. Then, check it. 2449 * "charge_statistics" updated event counter. Then, check it.
2579 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2450 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2585#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2456#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2586 2457
2587#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2458#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2588 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2459 (1 << PCG_MIGRATION))
2589/* 2460/*
2590 * Because tail pages are not marked as "used", set it. We're under 2461 * Because tail pages are not marked as "used", set it. We're under
2591 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2462 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2463 * charge/uncharge will never happen and move_account() is done under
2464 * compound_lock(), so we don't have to take care of races.
2592 */ 2465 */
2593void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2466void mem_cgroup_split_huge_fixup(struct page *head)
2594{ 2467{
2595 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2468 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2596 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2469 struct page_cgroup *pc;
2597 unsigned long flags; 2470 int i;
2598 2471
2599 if (mem_cgroup_disabled()) 2472 if (mem_cgroup_disabled())
2600 return; 2473 return;
2601 /* 2474 for (i = 1; i < HPAGE_PMD_NR; i++) {
2602 * We have no races with charge/uncharge but will have races with 2475 pc = head_pc + i;
2603 * page state accounting. 2476 pc->mem_cgroup = head_pc->mem_cgroup;
2604 */ 2477 smp_wmb();/* see __commit_charge() */
2605 move_lock_page_cgroup(head_pc, &flags); 2478 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2606
2607 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2608 smp_wmb(); /* see __commit_charge() */
2609 if (PageCgroupAcctLRU(head_pc)) {
2610 enum lru_list lru;
2611 struct mem_cgroup_per_zone *mz;
2612
2613 /*
2614 * LRU flags cannot be copied because we need to add tail
2615 * page to LRU by a generic call and our hook will be called.
2616 * We hold lru_lock, then, reduce counter directly.
2617 */
2618 lru = page_lru(head);
2619 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2620 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2621 } 2479 }
2622 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2623 move_unlock_page_cgroup(head_pc, &flags);
2624} 2480}
2625#endif 2481#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
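
After this change mem_cgroup_split_huge_fixup() simply copies the head page_cgroup's owner and all flags except those in PCGF_NOCOPY_AT_SPLIT into every tail page_cgroup. The same propagate-with-a-mask idea in a self-contained sketch; HPAGE_NR, the flag bits and the struct below are invented for the example.

    #include <stdio.h>

    #define HPAGE_NR        8
    #define FLAG_USED       0x1UL
    #define FLAG_LOCK       0x2UL   /* "do not copy to tails" class of bits */
    #define NOCOPY_AT_SPLIT FLAG_LOCK

    struct pc { int owner; unsigned long flags; };

    static void split_fixup(struct pc pcs[HPAGE_NR])
    {
        int i;

        for (i = 1; i < HPAGE_NR; i++) {
            pcs[i].owner = pcs[0].owner;                    /* same owner as head */
            pcs[i].flags = pcs[0].flags & ~NOCOPY_AT_SPLIT; /* mask private bits */
        }
    }

    int main(void)
    {
        struct pc pcs[HPAGE_NR] = { { .owner = 42, .flags = FLAG_USED | FLAG_LOCK } };

        split_fixup(pcs);
        printf("tail 1: owner=%d flags=%#lx\n", pcs[1].owner, pcs[1].flags);
        return 0;
    }
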
2626 2482
2627/** 2483/**
2628 * mem_cgroup_move_account - move account of the page 2484 * mem_cgroup_move_account - move account of the page
@@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page,
2737 2593
2738 parent = mem_cgroup_from_cont(pcg); 2594 parent = mem_cgroup_from_cont(pcg);
2739 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2595 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2740 if (ret || !parent) 2596 if (ret)
2741 goto put_back; 2597 goto put_back;
2742 2598
2743 if (nr_pages > 1) 2599 if (nr_pages > 1)
@@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2783 } 2639 }
2784 2640
2785 pc = lookup_page_cgroup(page); 2641 pc = lookup_page_cgroup(page);
2786 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2787
2788 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2642 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2789 if (ret || !memcg) 2643 if (ret == -ENOMEM)
2790 return ret; 2644 return ret;
2791
2792 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2645 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2793 return 0; 2646 return 0;
2794} 2647}
@@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page,
2798{ 2651{
2799 if (mem_cgroup_disabled()) 2652 if (mem_cgroup_disabled())
2800 return 0; 2653 return 0;
2801 /* 2654 VM_BUG_ON(page_mapped(page));
2802 * If already mapped, we don't have to account. 2655 VM_BUG_ON(page->mapping && !PageAnon(page));
2803 * If page cache, page->mapping has address_space. 2656 VM_BUG_ON(!mm);
2804 * But page->mapping may have out-of-use anon_vma pointer,
2805 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
2806 * is NULL.
2807 */
2808 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2809 return 0;
2810 if (unlikely(!mm))
2811 mm = &init_mm;
2812 return mem_cgroup_charge_common(page, mm, gfp_mask, 2657 return mem_cgroup_charge_common(page, mm, gfp_mask,
2813 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2658 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2814} 2659}
2815 2660
2816static void 2661static void
@@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2822 enum charge_type ctype) 2667 enum charge_type ctype)
2823{ 2668{
2824 struct page_cgroup *pc = lookup_page_cgroup(page); 2669 struct page_cgroup *pc = lookup_page_cgroup(page);
2670 struct zone *zone = page_zone(page);
2671 unsigned long flags;
2672 bool removed = false;
2673
2825 /* 2674 /*
2826 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page 2675 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2827 * is already on LRU. It means the page may on some other page_cgroup's 2676 * is already on LRU. It means the page may on some other page_cgroup's
2828 * LRU. Take care of it. 2677 * LRU. Take care of it.
2829 */ 2678 */
2830 mem_cgroup_lru_del_before_commit(page); 2679 spin_lock_irqsave(&zone->lru_lock, flags);
2680 if (PageLRU(page)) {
2681 del_page_from_lru_list(zone, page, page_lru(page));
2682 ClearPageLRU(page);
2683 removed = true;
2684 }
2831 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); 2685 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2832 mem_cgroup_lru_add_after_commit(page); 2686 if (removed) {
2687 add_page_to_lru_list(zone, page, page_lru(page));
2688 SetPageLRU(page);
2689 }
2690 spin_unlock_irqrestore(&zone->lru_lock, flags);
2833 return; 2691 return;
2834} 2692}
2835 2693
@@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2837 gfp_t gfp_mask) 2695 gfp_t gfp_mask)
2838{ 2696{
2839 struct mem_cgroup *memcg = NULL; 2697 struct mem_cgroup *memcg = NULL;
2698 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2840 int ret; 2699 int ret;
2841 2700
2842 if (mem_cgroup_disabled()) 2701 if (mem_cgroup_disabled())
@@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2846 2705
2847 if (unlikely(!mm)) 2706 if (unlikely(!mm))
2848 mm = &init_mm; 2707 mm = &init_mm;
2708 if (!page_is_file_cache(page))
2709 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2849 2710
2850 if (page_is_file_cache(page)) { 2711 if (!PageSwapCache(page))
2851 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); 2712 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2852 if (ret || !memcg) 2713 else { /* page is swapcache/shmem */
2853 return ret;
2854
2855 /*
2856 * FUSE reuses pages without going through the final
2857 * put that would remove them from the LRU list, make
2858 * sure that they get relinked properly.
2859 */
2860 __mem_cgroup_commit_charge_lrucare(page, memcg,
2861 MEM_CGROUP_CHARGE_TYPE_CACHE);
2862 return ret;
2863 }
2864 /* shmem */
2865 if (PageSwapCache(page)) {
2866 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2714 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2867 if (!ret) 2715 if (!ret)
2868 __mem_cgroup_commit_charge_swapin(page, memcg, 2716 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2869 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2717 }
2870 } else
2871 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2872 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2873
2874 return ret; 2718 return ret;
2875} 2719}
2876 2720
@@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2882 */ 2726 */
2883int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2727int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2884 struct page *page, 2728 struct page *page,
2885 gfp_t mask, struct mem_cgroup **ptr) 2729 gfp_t mask, struct mem_cgroup **memcgp)
2886{ 2730{
2887 struct mem_cgroup *memcg; 2731 struct mem_cgroup *memcg;
2888 int ret; 2732 int ret;
2889 2733
2890 *ptr = NULL; 2734 *memcgp = NULL;
2891 2735
2892 if (mem_cgroup_disabled()) 2736 if (mem_cgroup_disabled())
2893 return 0; 2737 return 0;
@@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2905 memcg = try_get_mem_cgroup_from_page(page); 2749 memcg = try_get_mem_cgroup_from_page(page);
2906 if (!memcg) 2750 if (!memcg)
2907 goto charge_cur_mm; 2751 goto charge_cur_mm;
2908 *ptr = memcg; 2752 *memcgp = memcg;
2909 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2753 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2910 css_put(&memcg->css); 2754 css_put(&memcg->css);
2755 if (ret == -EINTR)
2756 ret = 0;
2911 return ret; 2757 return ret;
2912charge_cur_mm: 2758charge_cur_mm:
2913 if (unlikely(!mm)) 2759 if (unlikely(!mm))
2914 mm = &init_mm; 2760 mm = &init_mm;
2915 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2761 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2762 if (ret == -EINTR)
2763 ret = 0;
2764 return ret;
2916} 2765}
2917 2766
2918static void 2767static void
2919__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2768__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2920 enum charge_type ctype) 2769 enum charge_type ctype)
2921{ 2770{
2922 if (mem_cgroup_disabled()) 2771 if (mem_cgroup_disabled())
2923 return; 2772 return;
2924 if (!ptr) 2773 if (!memcg)
2925 return; 2774 return;
2926 cgroup_exclude_rmdir(&ptr->css); 2775 cgroup_exclude_rmdir(&memcg->css);
2927 2776
2928 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2777 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
2929 /* 2778 /*
2930 * Now the swap is in memory. This means this page may be 2779 * Now the swap is in memory. This means this page may be
2931 * counted both as mem and swap....double count. 2780 * counted both as mem and swap....double count.
@@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2935 */ 2784 */
2936 if (do_swap_account && PageSwapCache(page)) { 2785 if (do_swap_account && PageSwapCache(page)) {
2937 swp_entry_t ent = {.val = page_private(page)}; 2786 swp_entry_t ent = {.val = page_private(page)};
2787 struct mem_cgroup *swap_memcg;
2938 unsigned short id; 2788 unsigned short id;
2939 struct mem_cgroup *memcg;
2940 2789
2941 id = swap_cgroup_record(ent, 0); 2790 id = swap_cgroup_record(ent, 0);
2942 rcu_read_lock(); 2791 rcu_read_lock();
2943 memcg = mem_cgroup_lookup(id); 2792 swap_memcg = mem_cgroup_lookup(id);
2944 if (memcg) { 2793 if (swap_memcg) {
2945 /* 2794 /*
2946 * This recorded memcg can be an obsolete one. So, avoid 2795 * This recorded memcg can be an obsolete one. So, avoid
2947 * calling css_tryget 2796 * calling css_tryget
2948 */ 2797 */
2949 if (!mem_cgroup_is_root(memcg)) 2798 if (!mem_cgroup_is_root(swap_memcg))
2950 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2799 res_counter_uncharge(&swap_memcg->memsw,
2951 mem_cgroup_swap_statistics(memcg, false); 2800 PAGE_SIZE);
2952 mem_cgroup_put(memcg); 2801 mem_cgroup_swap_statistics(swap_memcg, false);
2802 mem_cgroup_put(swap_memcg);
2953 } 2803 }
2954 rcu_read_unlock(); 2804 rcu_read_unlock();
2955 } 2805 }
@@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2958 * So, rmdir()->pre_destroy() can be called while we do this charge. 2808 * So, rmdir()->pre_destroy() can be called while we do this charge.
2959 * In that case, we need to call pre_destroy() again. check it here. 2809 * In that case, we need to call pre_destroy() again. check it here.
2960 */ 2810 */
2961 cgroup_release_and_wakeup_rmdir(&ptr->css); 2811 cgroup_release_and_wakeup_rmdir(&memcg->css);
2962} 2812}
2963 2813
2964void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2814void mem_cgroup_commit_charge_swapin(struct page *page,
2815 struct mem_cgroup *memcg)
2965{ 2816{
2966 __mem_cgroup_commit_charge_swapin(page, ptr, 2817 __mem_cgroup_commit_charge_swapin(page, memcg,
2967 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2818 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2968} 2819}
2969 2820
2970void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2821void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
@@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3054 * Check if our page_cgroup is valid 2905 * Check if our page_cgroup is valid
3055 */ 2906 */
3056 pc = lookup_page_cgroup(page); 2907 pc = lookup_page_cgroup(page);
3057 if (unlikely(!pc || !PageCgroupUsed(pc))) 2908 if (unlikely(!PageCgroupUsed(pc)))
3058 return NULL; 2909 return NULL;
3059 2910
3060 lock_page_cgroup(pc); 2911 lock_page_cgroup(pc);
@@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page)
3117 /* early check. */ 2968 /* early check. */
3118 if (page_mapped(page)) 2969 if (page_mapped(page))
3119 return; 2970 return;
3120 if (page->mapping && !PageAnon(page)) 2971 VM_BUG_ON(page->mapping && !PageAnon(page));
3121 return;
3122 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2972 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3123} 2973}
3124 2974
@@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void)
3176 batch->memcg = NULL; 3026 batch->memcg = NULL;
3177} 3027}
3178 3028
3029/*
3030 * A function for resetting pc->mem_cgroup for newly allocated pages.
3031 * This function should be called if the newpage will be added to the LRU
3032 * before accounting starts.
3033 */
3034void mem_cgroup_reset_owner(struct page *newpage)
3035{
3036 struct page_cgroup *pc;
3037
3038 if (mem_cgroup_disabled())
3039 return;
3040
3041 pc = lookup_page_cgroup(newpage);
3042 VM_BUG_ON(PageCgroupUsed(pc));
3043 pc->mem_cgroup = root_mem_cgroup;
3044}
3045
3179#ifdef CONFIG_SWAP 3046#ifdef CONFIG_SWAP
3180/* 3047/*
3181 * called after __delete_from_swap_cache() and drop "page" account. 3048 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3293 * page belongs to. 3160 * page belongs to.
3294 */ 3161 */
3295int mem_cgroup_prepare_migration(struct page *page, 3162int mem_cgroup_prepare_migration(struct page *page,
3296 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3163 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
3297{ 3164{
3298 struct mem_cgroup *memcg = NULL; 3165 struct mem_cgroup *memcg = NULL;
3299 struct page_cgroup *pc; 3166 struct page_cgroup *pc;
3300 enum charge_type ctype; 3167 enum charge_type ctype;
3301 int ret = 0; 3168 int ret = 0;
3302 3169
3303 *ptr = NULL; 3170 *memcgp = NULL;
3304 3171
3305 VM_BUG_ON(PageTransHuge(page)); 3172 VM_BUG_ON(PageTransHuge(page));
3306 if (mem_cgroup_disabled()) 3173 if (mem_cgroup_disabled())
@@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page,
3351 if (!memcg) 3218 if (!memcg)
3352 return 0; 3219 return 0;
3353 3220
3354 *ptr = memcg; 3221 *memcgp = memcg;
3355 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3222 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3356 css_put(&memcg->css);/* drop extra refcnt */ 3223 css_put(&memcg->css);/* drop extra refcnt */
3357 if (ret || *ptr == NULL) { 3224 if (ret) {
3358 if (PageAnon(page)) { 3225 if (PageAnon(page)) {
3359 lock_page_cgroup(pc); 3226 lock_page_cgroup(pc);
3360 ClearPageCgroupMigration(pc); 3227 ClearPageCgroupMigration(pc);
@@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3364 */ 3231 */
3365 mem_cgroup_uncharge_page(page); 3232 mem_cgroup_uncharge_page(page);
3366 } 3233 }
3234 /* we'll need to revisit this error code (we have -EINTR) */
3367 return -ENOMEM; 3235 return -ENOMEM;
3368 } 3236 }
3369 /* 3237 /*
@@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3432 cgroup_release_and_wakeup_rmdir(&memcg->css); 3300 cgroup_release_and_wakeup_rmdir(&memcg->css);
3433} 3301}
3434 3302
3303/*
3304 * When replacing page cache, the newpage is not under any memcg but it is on
3305 * the LRU. So this function doesn't touch the res_counter but handles the LRU
3306 * in the correct way. Both pages are locked so we cannot race with uncharge.
3307 */
3308void mem_cgroup_replace_page_cache(struct page *oldpage,
3309 struct page *newpage)
3310{
3311 struct mem_cgroup *memcg;
3312 struct page_cgroup *pc;
3313 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3314
3315 if (mem_cgroup_disabled())
3316 return;
3317
3318 pc = lookup_page_cgroup(oldpage);
3319 /* fix accounting on old pages */
3320 lock_page_cgroup(pc);
3321 memcg = pc->mem_cgroup;
3322 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3323 ClearPageCgroupUsed(pc);
3324 unlock_page_cgroup(pc);
3325
3326 if (PageSwapBacked(oldpage))
3327 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3328
3329 /*
3330 * Even if newpage->mapping was NULL before starting replacement,
3331 * the newpage may already be on the LRU (or in a pagevec headed for the LRU).
3332 * We lock the LRU while we overwrite pc->mem_cgroup.
3333 */
3334 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
3335}
3336
3435#ifdef CONFIG_DEBUG_VM 3337#ifdef CONFIG_DEBUG_VM
3436static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3338static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3437{ 3339{
3438 struct page_cgroup *pc; 3340 struct page_cgroup *pc;
3439 3341
3440 pc = lookup_page_cgroup(page); 3342 pc = lookup_page_cgroup(page);
3343 /*
3344 * Can be NULL while feeding pages into the page allocator for
3345 * the first time, i.e. during boot or memory hotplug;
3346 * or when mem_cgroup_disabled().
3347 */
3441 if (likely(pc) && PageCgroupUsed(pc)) 3348 if (likely(pc) && PageCgroupUsed(pc))
3442 return pc; 3349 return pc;
3443 return NULL; 3350 return NULL;
@@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page)
3457 3364
3458 pc = lookup_page_cgroup_used(page); 3365 pc = lookup_page_cgroup_used(page);
3459 if (pc) { 3366 if (pc) {
3460 int ret = -1; 3367 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3461 char *path;
3462
3463 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3464 pc, pc->flags, pc->mem_cgroup); 3368 pc, pc->flags, pc->mem_cgroup);
3465
3466 path = kmalloc(PATH_MAX, GFP_KERNEL);
3467 if (path) {
3468 rcu_read_lock();
3469 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3470 path, PATH_MAX);
3471 rcu_read_unlock();
3472 }
3473
3474 printk(KERN_CONT "(%s)\n",
3475 (ret < 0) ? "cannot get the path" : path);
3476 kfree(path);
3477 } 3369 }
3478} 3370}
3479#endif 3371#endif
@@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3534 if (!ret) 3426 if (!ret)
3535 break; 3427 break;
3536 3428
3537 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3429 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3538 MEM_CGROUP_RECLAIM_SHRINK, 3430 MEM_CGROUP_RECLAIM_SHRINK);
3539 NULL);
3540 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3431 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3541 /* Usage is reduced ? */ 3432 /* Usage is reduced ? */
3542 if (curusage >= oldusage) 3433 if (curusage >= oldusage)
@@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3594 if (!ret) 3485 if (!ret)
3595 break; 3486 break;
3596 3487
3597 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3488 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3598 MEM_CGROUP_RECLAIM_NOSWAP | 3489 MEM_CGROUP_RECLAIM_NOSWAP |
3599 MEM_CGROUP_RECLAIM_SHRINK, 3490 MEM_CGROUP_RECLAIM_SHRINK);
3600 NULL);
3601 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3491 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3602 /* Usage is reduced ? */ 3492 /* Usage is reduced ? */
3603 if (curusage >= oldusage) 3493 if (curusage >= oldusage)
@@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3640 break; 3530 break;
3641 3531
3642 nr_scanned = 0; 3532 nr_scanned = 0;
3643 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3533 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
3644 gfp_mask, 3534 gfp_mask, &nr_scanned);
3645 MEM_CGROUP_RECLAIM_SOFT,
3646 &nr_scanned);
3647 nr_reclaimed += reclaimed; 3535 nr_reclaimed += reclaimed;
3648 *total_scanned += nr_scanned; 3536 *total_scanned += nr_scanned;
3649 spin_lock(&mctz->lock); 3537 spin_lock(&mctz->lock);
@@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3711static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3599static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3712 int node, int zid, enum lru_list lru) 3600 int node, int zid, enum lru_list lru)
3713{ 3601{
3714 struct zone *zone;
3715 struct mem_cgroup_per_zone *mz; 3602 struct mem_cgroup_per_zone *mz;
3716 struct page_cgroup *pc, *busy;
3717 unsigned long flags, loop; 3603 unsigned long flags, loop;
3718 struct list_head *list; 3604 struct list_head *list;
3605 struct page *busy;
3606 struct zone *zone;
3719 int ret = 0; 3607 int ret = 0;
3720 3608
3721 zone = &NODE_DATA(node)->node_zones[zid]; 3609 zone = &NODE_DATA(node)->node_zones[zid];
3722 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3610 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3723 list = &mz->lists[lru]; 3611 list = &mz->lruvec.lists[lru];
3724 3612
3725 loop = MEM_CGROUP_ZSTAT(mz, lru); 3613 loop = MEM_CGROUP_ZSTAT(mz, lru);
3726 /* give some margin against EBUSY etc...*/ 3614 /* give some margin against EBUSY etc...*/
3727 loop += 256; 3615 loop += 256;
3728 busy = NULL; 3616 busy = NULL;
3729 while (loop--) { 3617 while (loop--) {
3618 struct page_cgroup *pc;
3730 struct page *page; 3619 struct page *page;
3731 3620
3732 ret = 0; 3621 ret = 0;
@@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3735 spin_unlock_irqrestore(&zone->lru_lock, flags); 3624 spin_unlock_irqrestore(&zone->lru_lock, flags);
3736 break; 3625 break;
3737 } 3626 }
3738 pc = list_entry(list->prev, struct page_cgroup, lru); 3627 page = list_entry(list->prev, struct page, lru);
3739 if (busy == pc) { 3628 if (busy == page) {
3740 list_move(&pc->lru, list); 3629 list_move(&page->lru, list);
3741 busy = NULL; 3630 busy = NULL;
3742 spin_unlock_irqrestore(&zone->lru_lock, flags); 3631 spin_unlock_irqrestore(&zone->lru_lock, flags);
3743 continue; 3632 continue;
3744 } 3633 }
3745 spin_unlock_irqrestore(&zone->lru_lock, flags); 3634 spin_unlock_irqrestore(&zone->lru_lock, flags);
3746 3635
3747 page = lookup_cgroup_page(pc); 3636 pc = lookup_page_cgroup(page);
3748 3637
3749 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3638 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3750 if (ret == -ENOMEM) 3639 if (ret == -ENOMEM || ret == -EINTR)
3751 break; 3640 break;
3752 3641
3753 if (ret == -EBUSY || ret == -EINVAL) { 3642 if (ret == -EBUSY || ret == -EINVAL) {
3754 /* found lock contention or "pc" is obsolete. */ 3643 /* found lock contention or "pc" is obsolete. */
3755 busy = pc; 3644 busy = page;
3756 cond_resched(); 3645 cond_resched();
3757 } else 3646 } else
3758 busy = NULL; 3647 busy = NULL;
@@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4846 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4735 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4847 mz = &pn->zoneinfo[zone]; 4736 mz = &pn->zoneinfo[zone];
4848 for_each_lru(l) 4737 for_each_lru(l)
4849 INIT_LIST_HEAD(&mz->lists[l]); 4738 INIT_LIST_HEAD(&mz->lruvec.lists[l]);
4850 mz->usage_in_excess = 0; 4739 mz->usage_in_excess = 0;
4851 mz->on_tree = false; 4740 mz->on_tree = false;
4852 mz->mem = memcg; 4741 mz->mem = memcg;
@@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4906 mem_cgroup_remove_from_trees(memcg); 4795 mem_cgroup_remove_from_trees(memcg);
4907 free_css_id(&mem_cgroup_subsys, &memcg->css); 4796 free_css_id(&mem_cgroup_subsys, &memcg->css);
4908 4797
4909 for_each_node_state(node, N_POSSIBLE) 4798 for_each_node(node)
4910 free_mem_cgroup_per_zone_info(memcg, node); 4799 free_mem_cgroup_per_zone_info(memcg, node);
4911 4800
4912 free_percpu(memcg->stat); 4801 free_percpu(memcg->stat);
@@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void)
4965 struct mem_cgroup_tree_per_zone *rtpz; 4854 struct mem_cgroup_tree_per_zone *rtpz;
4966 int tmp, node, zone; 4855 int tmp, node, zone;
4967 4856
4968 for_each_node_state(node, N_POSSIBLE) { 4857 for_each_node(node) {
4969 tmp = node; 4858 tmp = node;
4970 if (!node_state(node, N_NORMAL_MEMORY)) 4859 if (!node_state(node, N_NORMAL_MEMORY))
4971 tmp = -1; 4860 tmp = -1;
4972 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4861 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4973 if (!rtpn) 4862 if (!rtpn)
4974 return 1; 4863 goto err_cleanup;
4975 4864
4976 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4865 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4977 4866
@@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
4982 } 4871 }
4983 } 4872 }
4984 return 0; 4873 return 0;
4874
4875err_cleanup:
4876 for_each_node(node) {
4877 if (!soft_limit_tree.rb_tree_per_node[node])
4878 break;
4879 kfree(soft_limit_tree.rb_tree_per_node[node]);
4880 soft_limit_tree.rb_tree_per_node[node] = NULL;
4881 }
4882 return 1;
4883
4985} 4884}
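
The new err_cleanup path above frees the per-node trees that were allocated before the failing allocation instead of leaking them. The allocate-or-unwind pattern on its own, as a small stand-alone program; the array size and element type are arbitrary stand-ins for the per-node soft-limit trees.

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_NODES 8

    static int *per_node[NR_NODES];

    static int init_all(void)
    {
        int node;

        for (node = 0; node < NR_NODES; node++) {
            per_node[node] = calloc(1, sizeof(int));
            if (!per_node[node])
                goto err_cleanup;
        }
        return 0;

    err_cleanup:
        /* free everything allocated so far; stop at the first hole */
        for (node = 0; node < NR_NODES; node++) {
            if (!per_node[node])
                break;
            free(per_node[node]);
            per_node[node] = NULL;
        }
        return 1;
    }

    int main(void)
    {
        printf("init_all() = %d\n", init_all());
        return 0;
    }
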
4986 4885
4987static struct cgroup_subsys_state * __ref 4886static struct cgroup_subsys_state * __ref
@@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4995 if (!memcg) 4894 if (!memcg)
4996 return ERR_PTR(error); 4895 return ERR_PTR(error);
4997 4896
4998 for_each_node_state(node, N_POSSIBLE) 4897 for_each_node(node)
4999 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4898 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5000 goto free_out; 4899 goto free_out;
5001 4900
@@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5033 res_counter_init(&memcg->res, NULL); 4932 res_counter_init(&memcg->res, NULL);
5034 res_counter_init(&memcg->memsw, NULL); 4933 res_counter_init(&memcg->memsw, NULL);
5035 } 4934 }
5036 memcg->last_scanned_child = 0;
5037 memcg->last_scanned_node = MAX_NUMNODES; 4935 memcg->last_scanned_node = MAX_NUMNODES;
5038 INIT_LIST_HEAD(&memcg->oom_notify); 4936 INIT_LIST_HEAD(&memcg->oom_notify);
5039 4937
@@ -5129,9 +5027,9 @@ one_by_one:
5129 } 5027 }
5130 ret = __mem_cgroup_try_charge(NULL, 5028 ret = __mem_cgroup_try_charge(NULL,
5131 GFP_KERNEL, 1, &memcg, false); 5029 GFP_KERNEL, 1, &memcg, false);
5132 if (ret || !memcg) 5030 if (ret)
5133 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5134 return -ENOMEM; 5032 return ret;
5135 mc.precharge++; 5033 mc.precharge++;
5136 } 5034 }
5137 return ret; 5035 return ret;
@@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5276 } 5174 }
5277 /* There is a swap entry and a page doesn't exist or isn't charged */ 5175 /* There is a swap entry and a page doesn't exist or isn't charged */
5278 if (ent.val && !ret && 5176 if (ent.val && !ret &&
5279 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5177 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
5280 ret = MC_TARGET_SWAP; 5178 ret = MC_TARGET_SWAP;
5281 if (target) 5179 if (target)
5282 target->ent = ent; 5180 target->ent = ent;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 06d3479513aa..56080ea36140 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags)
1557 page_is_file_cache(page)); 1557 page_is_file_cache(page));
1558 list_add(&page->lru, &pagelist); 1558 list_add(&page->lru, &pagelist);
1559 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1559 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1560 0, true); 1560 0, MIGRATE_SYNC);
1561 if (ret) { 1561 if (ret) {
1562 putback_lru_pages(&pagelist); 1562 putback_lru_pages(&pagelist);
1563 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1563 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 829d43735402..5e30583c2605 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{ 293{
294 struct mmu_gather_batch *batch; 294 struct mmu_gather_batch *batch;
295 295
296 tlb->need_flush = 1; 296 VM_BUG_ON(!tlb->need_flush);
297 297
298 if (tlb_fast_mode(tlb)) { 298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page); 299 free_page_and_swap_cache(page);
@@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1231 if (next-addr != HPAGE_PMD_SIZE) { 1231 if (next-addr != HPAGE_PMD_SIZE) {
1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd); 1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd)) 1234 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1235 continue; 1235 continue;
1236 /* fall through */ 1236 /* fall through */
1237 } 1237 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2168489c0bc9..6629fafd6ce4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
809 } 809 }
810 /* this function returns # of failed pages */ 810 /* this function returns # of failed pages */
811 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 811 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
812 true, true); 812 true, MIGRATE_SYNC);
813 if (ret) 813 if (ret)
814 putback_lru_pages(&source); 814 putback_lru_pages(&source);
815 } 815 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e3d58f088466..06b145fb64ab 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
942 942
943 if (!list_empty(&pagelist)) { 943 if (!list_empty(&pagelist)) {
944 err = migrate_pages(&pagelist, new_node_page, dest, 944 err = migrate_pages(&pagelist, new_node_page, dest,
945 false, true); 945 false, MIGRATE_SYNC);
946 if (err) 946 if (err)
947 putback_lru_pages(&pagelist); 947 putback_lru_pages(&pagelist);
948 } 948 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 89ea0854332e..9871a56d82c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -216,6 +216,56 @@ out:
216 pte_unmap_unlock(ptep, ptl); 216 pte_unmap_unlock(ptep, ptl);
217} 217}
218 218
219#ifdef CONFIG_BLOCK
220/* Returns true if all buffers are successfully locked */
221static bool buffer_migrate_lock_buffers(struct buffer_head *head,
222 enum migrate_mode mode)
223{
224 struct buffer_head *bh = head;
225
226 /* Simple case, sync compaction */
227 if (mode != MIGRATE_ASYNC) {
228 do {
229 get_bh(bh);
230 lock_buffer(bh);
231 bh = bh->b_this_page;
232
233 } while (bh != head);
234
235 return true;
236 }
237
238 /* async case, we cannot block on lock_buffer so use trylock_buffer */
239 do {
240 get_bh(bh);
241 if (!trylock_buffer(bh)) {
242 /*
243 * We failed to lock the buffer and cannot stall in
244 * async migration. Release the taken locks
245 */
246 struct buffer_head *failed_bh = bh;
247 put_bh(failed_bh);
248 bh = head;
249 while (bh != failed_bh) {
250 unlock_buffer(bh);
251 put_bh(bh);
252 bh = bh->b_this_page;
253 }
254 return false;
255 }
256
257 bh = bh->b_this_page;
258 } while (bh != head);
259 return true;
260}
261#else
262static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
263 enum migrate_mode mode)
264{
265 return true;
266}
267#endif /* CONFIG_BLOCK */
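
The async branch of buffer_migrate_lock_buffers() above is an all-or-nothing trylock: take the buffer locks in ring order and, if any trylock fails, walk back from the head releasing everything taken so far, so the caller can return -EAGAIN without blocking. Below is a user-space sketch of the same pattern over an array of pthread mutexes; it is illustrative only and not the kernel buffer-lock primitives.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NLOCKS 4

    static pthread_mutex_t locks[NLOCKS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    };

    static bool trylock_all(void)
    {
        int i;

        for (i = 0; i < NLOCKS; i++) {
            if (pthread_mutex_trylock(&locks[i]) != 0) {
                /* roll back everything taken so far */
                while (--i >= 0)
                    pthread_mutex_unlock(&locks[i]);
                return false;
            }
        }
        return true;
    }

    static void unlock_all(void)
    {
        int i;

        for (i = 0; i < NLOCKS; i++)
            pthread_mutex_unlock(&locks[i]);
    }

    int main(void)
    {
        if (trylock_all()) {
            printf("got all buffer locks without blocking\n");
            unlock_all();
        } else {
            printf("contended, caller retries later (-EAGAIN)\n");
        }
        return 0;
    }
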
268
219/* 269/*
220 * Replace the page in the mapping. 270 * Replace the page in the mapping.
221 * 271 *
@@ -225,7 +275,8 @@ out:
225 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 275 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
226 */ 276 */
227static int migrate_page_move_mapping(struct address_space *mapping, 277static int migrate_page_move_mapping(struct address_space *mapping,
228 struct page *newpage, struct page *page) 278 struct page *newpage, struct page *page,
279 struct buffer_head *head, enum migrate_mode mode)
229{ 280{
230 int expected_count; 281 int expected_count;
231 void **pslot; 282 void **pslot;
@@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
255 } 306 }
256 307
257 /* 308 /*
309 * In the async migration case of moving a page with buffers, lock the
310 * buffers using trylock before the mapping is moved. If the mapping
311 * were moved first and we then failed to lock the buffers, we could
312 * not move the mapping back due to an elevated page count and would
313 * have to block waiting on other references to be dropped.
314 */
315 if (mode == MIGRATE_ASYNC && head &&
316 !buffer_migrate_lock_buffers(head, mode)) {
317 page_unfreeze_refs(page, expected_count);
318 spin_unlock_irq(&mapping->tree_lock);
319 return -EAGAIN;
320 }
321
322 /*
258 * Now we know that no one else is looking at the page. 323 * Now we know that no one else is looking at the page.
259 */ 324 */
260 get_page(newpage); /* add cache reference */ 325 get_page(newpage); /* add cache reference */
@@ -409,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page);
409 * Pages are locked upon entry and exit. 474 * Pages are locked upon entry and exit.
410 */ 475 */
411int migrate_page(struct address_space *mapping, 476int migrate_page(struct address_space *mapping,
412 struct page *newpage, struct page *page) 477 struct page *newpage, struct page *page,
478 enum migrate_mode mode)
413{ 479{
414 int rc; 480 int rc;
415 481
416 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 482 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
417 483
418 rc = migrate_page_move_mapping(mapping, newpage, page); 484 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
419 485
420 if (rc) 486 if (rc)
421 return rc; 487 return rc;
@@ -432,28 +498,28 @@ EXPORT_SYMBOL(migrate_page);
432 * exist. 498 * exist.
433 */ 499 */
434int buffer_migrate_page(struct address_space *mapping, 500int buffer_migrate_page(struct address_space *mapping,
435 struct page *newpage, struct page *page) 501 struct page *newpage, struct page *page, enum migrate_mode mode)
436{ 502{
437 struct buffer_head *bh, *head; 503 struct buffer_head *bh, *head;
438 int rc; 504 int rc;
439 505
440 if (!page_has_buffers(page)) 506 if (!page_has_buffers(page))
441 return migrate_page(mapping, newpage, page); 507 return migrate_page(mapping, newpage, page, mode);
442 508
443 head = page_buffers(page); 509 head = page_buffers(page);
444 510
445 rc = migrate_page_move_mapping(mapping, newpage, page); 511 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
446 512
447 if (rc) 513 if (rc)
448 return rc; 514 return rc;
449 515
450 bh = head; 516 /*
451 do { 517 * In the async case, migrate_page_move_mapping locked the buffers
452 get_bh(bh); 518 * with an IRQ-safe spinlock held. In the sync case, the buffers
453 lock_buffer(bh); 519 * need to be locked now
454 bh = bh->b_this_page; 520 */
455 521 if (mode != MIGRATE_ASYNC)
456 } while (bh != head); 522 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
457 523
458 ClearPagePrivate(page); 524 ClearPagePrivate(page);
459 set_page_private(newpage, page_private(page)); 525 set_page_private(newpage, page_private(page));
@@ -530,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page)
530 * Default handling if a filesystem does not provide a migration function. 596 * Default handling if a filesystem does not provide a migration function.
531 */ 597 */
532static int fallback_migrate_page(struct address_space *mapping, 598static int fallback_migrate_page(struct address_space *mapping,
533 struct page *newpage, struct page *page) 599 struct page *newpage, struct page *page, enum migrate_mode mode)
534{ 600{
535 if (PageDirty(page)) 601 if (PageDirty(page)) {
602 /* Only writeback pages in full synchronous migration */
603 if (mode != MIGRATE_SYNC)
604 return -EBUSY;
536 return writeout(mapping, page); 605 return writeout(mapping, page);
606 }
537 607
538 /* 608 /*
539 * Buffers may be managed in a filesystem specific way. 609 * Buffers may be managed in a filesystem specific way.
@@ -543,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping,
543 !try_to_release_page(page, GFP_KERNEL)) 613 !try_to_release_page(page, GFP_KERNEL))
544 return -EAGAIN; 614 return -EAGAIN;
545 615
546 return migrate_page(mapping, newpage, page); 616 return migrate_page(mapping, newpage, page, mode);
547} 617}
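
With the migrate_mode argument threaded through, the expensive steps become gated by mode: async never blocks, sync-light avoids writeback stalls, and only full sync may write a dirty page out before migrating it. The stand-alone sketch below models that decision with stub types and a pretend writeout step; these are not the kernel functions themselves.

    #include <errno.h>
    #include <stdio.h>

    enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

    static int fallback_migrate(int page_dirty, enum migrate_mode mode)
    {
        if (page_dirty) {
            /* only full synchronous migration may write back */
            if (mode != MIGRATE_SYNC)
                return -EBUSY;
            return 0;       /* pretend writeout() succeeded */
        }
        return 0;           /* clean page: plain migrate_page() path */
    }

    int main(void)
    {
        printf("dirty/async -> %d\n", fallback_migrate(1, MIGRATE_ASYNC));
        printf("dirty/sync  -> %d\n", fallback_migrate(1, MIGRATE_SYNC));
        printf("clean/async -> %d\n", fallback_migrate(0, MIGRATE_ASYNC));
        return 0;
    }
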
548 618
549/* 619/*
@@ -558,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping,
558 * == 0 - success 628 * == 0 - success
559 */ 629 */
560static int move_to_new_page(struct page *newpage, struct page *page, 630static int move_to_new_page(struct page *newpage, struct page *page,
561 int remap_swapcache, bool sync) 631 int remap_swapcache, enum migrate_mode mode)
562{ 632{
563 struct address_space *mapping; 633 struct address_space *mapping;
564 int rc; 634 int rc;
@@ -579,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
579 649
580 mapping = page_mapping(page); 650 mapping = page_mapping(page);
581 if (!mapping) 651 if (!mapping)
582 rc = migrate_page(mapping, newpage, page); 652 rc = migrate_page(mapping, newpage, page, mode);
583 else { 653 else if (mapping->a_ops->migratepage)
584 /* 654 /*
585 * Do not writeback pages if !sync and migratepage is 655 * Most pages have a mapping and most filesystems provide a
586 * not pointing to migrate_page() which is nonblocking 656 * migratepage callback. Anonymous pages are part of swap
587 * (swapcache/tmpfs uses migratepage = migrate_page). 657 * space which also has its own migratepage callback. This
658 * is the most common path for page migration.
588 */ 659 */
589 if (PageDirty(page) && !sync && 660 rc = mapping->a_ops->migratepage(mapping,
590 mapping->a_ops->migratepage != migrate_page) 661 newpage, page, mode);
591 rc = -EBUSY; 662 else
592 else if (mapping->a_ops->migratepage) 663 rc = fallback_migrate_page(mapping, newpage, page, mode);
593 /*
594 * Most pages have a mapping and most filesystems
595 * should provide a migration function. Anonymous
596 * pages are part of swap space which also has its
597 * own migration function. This is the most common
598 * path for page migration.
599 */
600 rc = mapping->a_ops->migratepage(mapping,
601 newpage, page);
602 else
603 rc = fallback_migrate_page(mapping, newpage, page);
604 }
605 664
606 if (rc) { 665 if (rc) {
607 newpage->mapping = NULL; 666 newpage->mapping = NULL;
@@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
616} 675}
617 676
618static int __unmap_and_move(struct page *page, struct page *newpage, 677static int __unmap_and_move(struct page *page, struct page *newpage,
619 int force, bool offlining, bool sync) 678 int force, bool offlining, enum migrate_mode mode)
620{ 679{
621 int rc = -EAGAIN; 680 int rc = -EAGAIN;
622 int remap_swapcache = 1; 681 int remap_swapcache = 1;
@@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
625 struct anon_vma *anon_vma = NULL; 684 struct anon_vma *anon_vma = NULL;
626 685
627 if (!trylock_page(page)) { 686 if (!trylock_page(page)) {
628 if (!force || !sync) 687 if (!force || mode == MIGRATE_ASYNC)
629 goto out; 688 goto out;
630 689
631 /* 690 /*
@@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
671 730
672 if (PageWriteback(page)) { 731 if (PageWriteback(page)) {
673 /* 732 /*
674 * For !sync, there is no point retrying as the retry loop 733 * Only in the case of a full synchronous migration is it
675 * is expected to be too short for PageWriteback to be cleared 734 * necessary to wait for PageWriteback. In the async case,
735 * the retry loop is too short and in the sync-light case,
736 * the overhead of stalling is too much
676 */ 737 */
677 if (!sync) { 738 if (mode != MIGRATE_SYNC) {
678 rc = -EBUSY; 739 rc = -EBUSY;
679 goto uncharge; 740 goto uncharge;
680 } 741 }
@@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
745 806
746skip_unmap: 807skip_unmap:
747 if (!page_mapped(page)) 808 if (!page_mapped(page))
748 rc = move_to_new_page(newpage, page, remap_swapcache, sync); 809 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
749 810
750 if (rc && remap_swapcache) 811 if (rc && remap_swapcache)
751 remove_migration_ptes(page, page); 812 remove_migration_ptes(page, page);
@@ -768,7 +829,8 @@ out:
768 * to the newly allocated page in newpage. 829 * to the newly allocated page in newpage.
769 */ 830 */
770static int unmap_and_move(new_page_t get_new_page, unsigned long private, 831static int unmap_and_move(new_page_t get_new_page, unsigned long private,
771 struct page *page, int force, bool offlining, bool sync) 832 struct page *page, int force, bool offlining,
833 enum migrate_mode mode)
772{ 834{
773 int rc = 0; 835 int rc = 0;
774 int *result = NULL; 836 int *result = NULL;
@@ -777,6 +839,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
777 if (!newpage) 839 if (!newpage)
778 return -ENOMEM; 840 return -ENOMEM;
779 841
842 mem_cgroup_reset_owner(newpage);
843
780 if (page_count(page) == 1) { 844 if (page_count(page) == 1) {
781 /* page was freed from under us. So we are done. */ 845 /* page was freed from under us. So we are done. */
782 goto out; 846 goto out;
@@ -786,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
786 if (unlikely(split_huge_page(page))) 850 if (unlikely(split_huge_page(page)))
787 goto out; 851 goto out;
788 852
789 rc = __unmap_and_move(page, newpage, force, offlining, sync); 853 rc = __unmap_and_move(page, newpage, force, offlining, mode);
790out: 854out:
791 if (rc != -EAGAIN) { 855 if (rc != -EAGAIN) {
792 /* 856 /*
@@ -834,7 +898,8 @@ out:
834 */ 898 */
835static int unmap_and_move_huge_page(new_page_t get_new_page, 899static int unmap_and_move_huge_page(new_page_t get_new_page,
836 unsigned long private, struct page *hpage, 900 unsigned long private, struct page *hpage,
837 int force, bool offlining, bool sync) 901 int force, bool offlining,
902 enum migrate_mode mode)
838{ 903{
839 int rc = 0; 904 int rc = 0;
840 int *result = NULL; 905 int *result = NULL;
@@ -847,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 rc = -EAGAIN; 912 rc = -EAGAIN;
848 913
849 if (!trylock_page(hpage)) { 914 if (!trylock_page(hpage)) {
850 if (!force || !sync) 915 if (!force || mode != MIGRATE_SYNC)
851 goto out; 916 goto out;
852 lock_page(hpage); 917 lock_page(hpage);
853 } 918 }
@@ -858,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
858 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 923 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
859 924
860 if (!page_mapped(hpage)) 925 if (!page_mapped(hpage))
861 rc = move_to_new_page(new_hpage, hpage, 1, sync); 926 rc = move_to_new_page(new_hpage, hpage, 1, mode);
862 927
863 if (rc) 928 if (rc)
864 remove_migration_ptes(hpage, hpage); 929 remove_migration_ptes(hpage, hpage);
@@ -901,7 +966,7 @@ out:
901 */ 966 */
902int migrate_pages(struct list_head *from, 967int migrate_pages(struct list_head *from,
903 new_page_t get_new_page, unsigned long private, bool offlining, 968 new_page_t get_new_page, unsigned long private, bool offlining,
904 bool sync) 969 enum migrate_mode mode)
905{ 970{
906 int retry = 1; 971 int retry = 1;
907 int nr_failed = 0; 972 int nr_failed = 0;
@@ -922,7 +987,7 @@ int migrate_pages(struct list_head *from,
922 987
923 rc = unmap_and_move(get_new_page, private, 988 rc = unmap_and_move(get_new_page, private,
924 page, pass > 2, offlining, 989 page, pass > 2, offlining,
925 sync); 990 mode);
926 991
927 switch(rc) { 992 switch(rc) {
928 case -ENOMEM: 993 case -ENOMEM:
@@ -952,7 +1017,7 @@ out:
952 1017
953int migrate_huge_pages(struct list_head *from, 1018int migrate_huge_pages(struct list_head *from,
954 new_page_t get_new_page, unsigned long private, bool offlining, 1019 new_page_t get_new_page, unsigned long private, bool offlining,
955 bool sync) 1020 enum migrate_mode mode)
956{ 1021{
957 int retry = 1; 1022 int retry = 1;
958 int nr_failed = 0; 1023 int nr_failed = 0;
@@ -969,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from,
969 1034
970 rc = unmap_and_move_huge_page(get_new_page, 1035 rc = unmap_and_move_huge_page(get_new_page,
971 private, page, pass > 2, offlining, 1036 private, page, pass > 2, offlining,
972 sync); 1037 mode);
973 1038
974 switch(rc) { 1039 switch(rc) {
975 case -ENOMEM: 1040 case -ENOMEM:
@@ -1098,7 +1163,7 @@ set_status:
1098 err = 0; 1163 err = 0;
1099 if (!list_empty(&pagelist)) { 1164 if (!list_empty(&pagelist)) {
1100 err = migrate_pages(&pagelist, new_page_node, 1165 err = migrate_pages(&pagelist, new_page_node,
1101 (unsigned long)pm, 0, true); 1166 (unsigned long)pm, 0, MIGRATE_SYNC);
1102 if (err) 1167 if (err)
1103 putback_lru_pages(&pagelist); 1168 putback_lru_pages(&pagelist);
1104 } 1169 }
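
The mm/migrate.c hunks above replace the old boolean sync flag with a three-valued enum migrate_mode, and the callers now decide per mode whether they may block: a failed trylock_page() is only retried with a sleeping lock_page() when the mode is not MIGRATE_ASYNC, PageWriteback is only waited on under MIGRATE_SYNC, and fallback_migrate_page() only writes a dirty page out under MIGRATE_SYNC. The fragment below is a minimal, self-contained sketch of that decision table, not kernel code; the enum mirrors the kernel's migrate_mode (defined elsewhere in this series), while the helper names and the demo loop are invented for illustration.

/*
 * Standalone model of the migrate_mode decisions visible in the
 * mm/migrate.c hunks above.  Build with: cc -o demo demo.c
 */
#include <stdbool.h>
#include <stdio.h>

enum migrate_mode {
        MIGRATE_ASYNC,          /* never block */
        MIGRATE_SYNC_LIGHT,     /* may block on locks, not on writeback */
        MIGRATE_SYNC,           /* may block on anything, may write out */
};

/* __unmap_and_move(): fall back to a sleeping lock_page() only if the
 * caller said "force" and the mode allows blocking at all. */
static bool may_block_on_lock(enum migrate_mode mode, bool force)
{
        return force && mode != MIGRATE_ASYNC;
}

/* __unmap_and_move(): only full sync migration waits for writeback. */
static bool may_wait_on_writeback(enum migrate_mode mode)
{
        return mode == MIGRATE_SYNC;
}

/* fallback_migrate_page(): dirty pages are written out (a blocking
 * operation) only under MIGRATE_SYNC; otherwise the page is skipped. */
static int fallback_migrate(bool dirty, enum migrate_mode mode)
{
        if (dirty && mode != MIGRATE_SYNC)
                return -16;     /* -EBUSY */
        return 0;               /* would call writeout()/migrate_page() */
}

int main(void)
{
        enum migrate_mode m;

        for (m = MIGRATE_ASYNC; m <= MIGRATE_SYNC; m++)
                printf("mode %d: lock wait %d, writeback wait %d, dirty fallback %d\n",
                       m, may_block_on_lock(m, true),
                       may_wait_on_writeback(m), fallback_migrate(true, m));
        return 0;
}

Running it prints one line per mode, which makes the async / sync-light / full-sync trade-off easy to read off at a glance.
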
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7c122faa05c5..2958fd8e7c9a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
152 152
153/* return true if the task is not adequate as candidate victim task. */ 153/* return true if the task is not adequate as candidate victim task. */
154static bool oom_unkillable_task(struct task_struct *p, 154static bool oom_unkillable_task(struct task_struct *p,
155 const struct mem_cgroup *mem, const nodemask_t *nodemask) 155 const struct mem_cgroup *memcg, const nodemask_t *nodemask)
156{ 156{
157 if (is_global_init(p)) 157 if (is_global_init(p))
158 return true; 158 return true;
@@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p,
160 return true; 160 return true;
161 161
162 /* When mem_cgroup_out_of_memory() and p is not member of the group */ 162 /* When mem_cgroup_out_of_memory() and p is not member of the group */
163 if (mem && !task_in_mem_cgroup(p, mem)) 163 if (memcg && !task_in_mem_cgroup(p, memcg))
164 return true; 164 return true;
165 165
166 /* p may not have freeable memory in nodemask */ 166 /* p may not have freeable memory in nodemask */
@@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p,
179 * predictable as possible. The goal is to return the highest value for the 179 * predictable as possible. The goal is to return the highest value for the
180 * task consuming the most memory to avoid subsequent oom failures. 180 * task consuming the most memory to avoid subsequent oom failures.
181 */ 181 */
182unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 182unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
183 const nodemask_t *nodemask, unsigned long totalpages) 183 const nodemask_t *nodemask, unsigned long totalpages)
184{ 184{
185 long points; 185 long points;
186 186
187 if (oom_unkillable_task(p, mem, nodemask)) 187 if (oom_unkillable_task(p, memcg, nodemask))
188 return 0; 188 return 0;
189 189
190 p = find_lock_task_mm(p); 190 p = find_lock_task_mm(p);
@@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
308 * (not docbooked, we don't want this one cluttering up the manual) 308 * (not docbooked, we don't want this one cluttering up the manual)
309 */ 309 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 310static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *mem, 311 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 312 const nodemask_t *nodemask)
313{ 313{
314 struct task_struct *g, *p; 314 struct task_struct *g, *p;
@@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
320 320
321 if (p->exit_state) 321 if (p->exit_state)
322 continue; 322 continue;
323 if (oom_unkillable_task(p, mem, nodemask)) 323 if (oom_unkillable_task(p, memcg, nodemask))
324 continue; 324 continue;
325 325
326 /* 326 /*
@@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
364 } 364 }
365 } 365 }
366 366
367 points = oom_badness(p, mem, nodemask, totalpages); 367 points = oom_badness(p, memcg, nodemask, totalpages);
368 if (points > *ppoints) { 368 if (points > *ppoints) {
369 chosen = p; 369 chosen = p;
370 *ppoints = points; 370 *ppoints = points;
@@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
387 * 387 *
388 * Call with tasklist_lock read-locked. 388 * Call with tasklist_lock read-locked.
389 */ 389 */
390static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) 390static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
391{ 391{
392 struct task_struct *p; 392 struct task_struct *p;
393 struct task_struct *task; 393 struct task_struct *task;
394 394
395 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 395 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
396 for_each_process(p) { 396 for_each_process(p) {
397 if (oom_unkillable_task(p, mem, nodemask)) 397 if (oom_unkillable_task(p, memcg, nodemask))
398 continue; 398 continue;
399 399
400 task = find_lock_task_mm(p); 400 task = find_lock_task_mm(p);
@@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
417} 417}
418 418
419static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 419static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
420 struct mem_cgroup *mem, const nodemask_t *nodemask) 420 struct mem_cgroup *memcg, const nodemask_t *nodemask)
421{ 421{
422 task_lock(current); 422 task_lock(current);
423 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 423 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
427 cpuset_print_task_mems_allowed(current); 427 cpuset_print_task_mems_allowed(current);
428 task_unlock(current); 428 task_unlock(current);
429 dump_stack(); 429 dump_stack();
430 mem_cgroup_print_oom_info(mem, p); 430 mem_cgroup_print_oom_info(memcg, p);
431 show_mem(SHOW_MEM_FILTER_NODES); 431 show_mem(SHOW_MEM_FILTER_NODES);
432 if (sysctl_oom_dump_tasks) 432 if (sysctl_oom_dump_tasks)
433 dump_tasks(mem, nodemask); 433 dump_tasks(memcg, nodemask);
434} 434}
435 435
436#define K(x) ((x) << (PAGE_SHIFT-10)) 436#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 437static int oom_kill_task(struct task_struct *p)
438{ 438{
439 struct task_struct *q; 439 struct task_struct *q;
440 struct mm_struct *mm; 440 struct mm_struct *mm;
@@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
484 484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages, 486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *mem, nodemask_t *nodemask, 487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message) 488 const char *message)
489{ 489{
490 struct task_struct *victim = p; 490 struct task_struct *victim = p;
@@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
493 unsigned int victim_points = 0; 493 unsigned int victim_points = 0;
494 494
495 if (printk_ratelimit()) 495 if (printk_ratelimit())
496 dump_header(p, gfp_mask, order, mem, nodemask); 496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 497
498 /* 498 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 499 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
524 /* 524 /*
525 * oom_badness() returns 0 if the thread is unkillable 525 * oom_badness() returns 0 if the thread is unkillable
526 */ 526 */
527 child_points = oom_badness(child, mem, nodemask, 527 child_points = oom_badness(child, memcg, nodemask,
528 totalpages); 528 totalpages);
529 if (child_points > victim_points) { 529 if (child_points > victim_points) {
530 victim = child; 530 victim = child;
@@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 533 }
534 } while_each_thread(p, t); 534 } while_each_thread(p, t);
535 535
536 return oom_kill_task(victim, mem); 536 return oom_kill_task(victim);
537} 537}
538 538
539/* 539/*
@@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 561}
562 562
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 563#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
565{ 565{
566 unsigned long limit; 566 unsigned long limit;
567 unsigned int points = 0; 567 unsigned int points = 0;
@@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
578 } 578 }
579 579
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
581 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 582 read_lock(&tasklist_lock);
583retry: 583retry:
584 p = select_bad_process(&points, limit, mem, NULL); 584 p = select_bad_process(&points, limit, memcg, NULL);
585 if (!p || PTR_ERR(p) == -1UL) 585 if (!p || PTR_ERR(p) == -1UL)
586 goto out; 586 goto out;
587 587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, 588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory")) 589 "Memory cgroup out of memory"))
590 goto retry; 590 goto retry;
591out: 591out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 794e6715c226..0027d8f4a1bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1981,14 +1981,20 @@ static struct page *
1981__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1981__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1982 struct zonelist *zonelist, enum zone_type high_zoneidx, 1982 struct zonelist *zonelist, enum zone_type high_zoneidx,
1983 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1983 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1984 int migratetype, unsigned long *did_some_progress, 1984 int migratetype, bool sync_migration,
1985 bool sync_migration) 1985 bool *deferred_compaction,
1986 unsigned long *did_some_progress)
1986{ 1987{
1987 struct page *page; 1988 struct page *page;
1988 1989
1989 if (!order || compaction_deferred(preferred_zone)) 1990 if (!order)
1990 return NULL; 1991 return NULL;
1991 1992
1993 if (compaction_deferred(preferred_zone)) {
1994 *deferred_compaction = true;
1995 return NULL;
1996 }
1997
1992 current->flags |= PF_MEMALLOC; 1998 current->flags |= PF_MEMALLOC;
1993 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1999 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1994 nodemask, sync_migration); 2000 nodemask, sync_migration);
@@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2016 * but not enough to satisfy watermarks. 2022 * but not enough to satisfy watermarks.
2017 */ 2023 */
2018 count_vm_event(COMPACTFAIL); 2024 count_vm_event(COMPACTFAIL);
2019 defer_compaction(preferred_zone); 2025
2026 /*
2027 * As async compaction considers a subset of pageblocks, only
2028 * defer if the failure was a sync compaction failure.
2029 */
2030 if (sync_migration)
2031 defer_compaction(preferred_zone);
2020 2032
2021 cond_resched(); 2033 cond_resched();
2022 } 2034 }
@@ -2028,8 +2040,9 @@ static inline struct page *
2028__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2040__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2029 struct zonelist *zonelist, enum zone_type high_zoneidx, 2041 struct zonelist *zonelist, enum zone_type high_zoneidx,
2030 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2042 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2031 int migratetype, unsigned long *did_some_progress, 2043 int migratetype, bool sync_migration,
2032 bool sync_migration) 2044 bool *deferred_compaction,
2045 unsigned long *did_some_progress)
2033{ 2046{
2034 return NULL; 2047 return NULL;
2035} 2048}
@@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2179 unsigned long pages_reclaimed = 0; 2192 unsigned long pages_reclaimed = 0;
2180 unsigned long did_some_progress; 2193 unsigned long did_some_progress;
2181 bool sync_migration = false; 2194 bool sync_migration = false;
2195 bool deferred_compaction = false;
2182 2196
2183 /* 2197 /*
2184 * In the slowpath, we sanity check order to avoid ever trying to 2198 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2259,12 +2273,22 @@ rebalance:
2259 zonelist, high_zoneidx, 2273 zonelist, high_zoneidx,
2260 nodemask, 2274 nodemask,
2261 alloc_flags, preferred_zone, 2275 alloc_flags, preferred_zone,
2262 migratetype, &did_some_progress, 2276 migratetype, sync_migration,
2263 sync_migration); 2277 &deferred_compaction,
2278 &did_some_progress);
2264 if (page) 2279 if (page)
2265 goto got_pg; 2280 goto got_pg;
2266 sync_migration = true; 2281 sync_migration = true;
2267 2282
2283 /*
2284 * If compaction is deferred for high-order allocations, it is because
2285 * sync compaction recently failed. If this is the case and the caller
2286 * has requested the system not be heavily disrupted, fail the
2287 * allocation now instead of entering direct reclaim
2288 */
2289 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2290 goto nopage;
2291
2268 /* Try direct reclaim and then allocating */ 2292 /* Try direct reclaim and then allocating */
2269 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2293 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2270 zonelist, high_zoneidx, 2294 zonelist, high_zoneidx,
@@ -2328,8 +2352,9 @@ rebalance:
2328 zonelist, high_zoneidx, 2352 zonelist, high_zoneidx,
2329 nodemask, 2353 nodemask,
2330 alloc_flags, preferred_zone, 2354 alloc_flags, preferred_zone,
2331 migratetype, &did_some_progress, 2355 migratetype, sync_migration,
2332 sync_migration); 2356 &deferred_compaction,
2357 &did_some_progress);
2333 if (page) 2358 if (page)
2334 goto got_pg; 2359 goto got_pg;
2335 } 2360 }
@@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4237 for (j = 0; j < MAX_NR_ZONES; j++) { 4262 for (j = 0; j < MAX_NR_ZONES; j++) {
4238 struct zone *zone = pgdat->node_zones + j; 4263 struct zone *zone = pgdat->node_zones + j;
4239 unsigned long size, realsize, memmap_pages; 4264 unsigned long size, realsize, memmap_pages;
4240 enum lru_list l; 4265 enum lru_list lru;
4241 4266
4242 size = zone_spanned_pages_in_node(nid, j, zones_size); 4267 size = zone_spanned_pages_in_node(nid, j, zones_size);
4243 realsize = size - zone_absent_pages_in_node(nid, j, 4268 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4287 zone->zone_pgdat = pgdat; 4312 zone->zone_pgdat = pgdat;
4288 4313
4289 zone_pcp_init(zone); 4314 zone_pcp_init(zone);
4290 for_each_lru(l) 4315 for_each_lru(lru)
4291 INIT_LIST_HEAD(&zone->lru[l].list); 4316 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4292 zone->reclaim_stat.recent_rotated[0] = 0; 4317 zone->reclaim_stat.recent_rotated[0] = 0;
4293 zone->reclaim_stat.recent_rotated[1] = 0; 4318 zone->reclaim_stat.recent_rotated[1] = 0;
4294 zone->reclaim_stat.recent_scanned[0] = 0; 4319 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
4642 4667
4643 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4668 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4644 struct zone *zone = &pgdat->node_zones[zone_type]; 4669 struct zone *zone = &pgdat->node_zones[zone_type];
4645 if (zone->present_pages) 4670 if (zone->present_pages) {
4646 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4671 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4672 break;
4673 }
4647 } 4674 }
4648#endif 4675#endif
4649} 4676}
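
In the mm/page_alloc.c hunks above, __alloc_pages_direct_compact() now reports deferred compaction back to its caller instead of silently returning NULL, compaction is only deferred again when a sync attempt fails, and the slowpath gives up before entering direct reclaim when compaction is deferred and the caller passed __GFP_NO_KSWAPD. The sketch below is a small standalone model of those two rules only; the gfp bit value and the zone_hint structure are stand-ins invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define __GFP_NO_KSWAPD (1u << 0)      /* stand-in bit for the real gfp flag */

struct zone_hint {
        bool compaction_deferred;      /* models compaction_deferred(zone) */
};

/* Rule 1: mirrors the COMPACTFAIL branch - only a sync failure defers. */
static void note_compact_failure(struct zone_hint *z, bool sync_migration)
{
        if (sync_migration)
                z->compaction_deferred = true;
}

/* Rule 2: mirrors the early "goto nopage" added to the slowpath. */
static bool should_fail_allocation(const struct zone_hint *z, unsigned gfp_mask)
{
        return z->compaction_deferred && (gfp_mask & __GFP_NO_KSWAPD);
}

int main(void)
{
        struct zone_hint z = { false };

        note_compact_failure(&z, false);   /* async failure: not deferred */
        printf("after async failure, fail low-disruption alloc? %d\n",
               should_fail_allocation(&z, __GFP_NO_KSWAPD));

        note_compact_failure(&z, true);    /* sync failure: deferred */
        printf("after sync failure,  fail low-disruption alloc? %d\n",
               should_fail_allocation(&z, __GFP_NO_KSWAPD));
        return 0;
}
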
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 2d123f94a8df..de1616aa9b1e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,13 +11,6 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15{
16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL;
19 INIT_LIST_HEAD(&pc->lru);
20}
21static unsigned long total_usage; 14static unsigned long total_usage;
22 15
23#if !defined(CONFIG_SPARSEMEM) 16#if !defined(CONFIG_SPARSEMEM)
@@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
35 struct page_cgroup *base; 28 struct page_cgroup *base;
36 29
37 base = NODE_DATA(page_to_nid(page))->node_page_cgroup; 30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base)) 38 if (unlikely(!base))
39 return NULL; 39 return NULL;
40 40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; 41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset; 42 return base + offset;
43} 43}
44 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
58static int __init alloc_node_page_cgroup(int nid) 45static int __init alloc_node_page_cgroup(int nid)
59{ 46{
60 struct page_cgroup *base, *pc; 47 struct page_cgroup *base;
61 unsigned long table_size; 48 unsigned long table_size;
62 unsigned long start_pfn, nr_pages, index; 49 unsigned long nr_pages;
63 50
64 start_pfn = NODE_DATA(nid)->node_start_pfn;
65 nr_pages = NODE_DATA(nid)->node_spanned_pages; 51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
66
67 if (!nr_pages) 52 if (!nr_pages)
68 return 0; 53 return 0;
69 54
@@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid)
73 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
74 if (!base) 59 if (!base)
75 return -ENOMEM; 60 return -ENOMEM;
76 for (index = 0; index < nr_pages; index++) {
77 pc = base + index;
78 init_page_cgroup(pc, nid);
79 }
80 NODE_DATA(nid)->node_page_cgroup = base; 61 NODE_DATA(nid)->node_page_cgroup = base;
81 total_usage += table_size; 62 total_usage += table_size;
82 return 0; 63 return 0;
@@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
111{ 92{
112 unsigned long pfn = page_to_pfn(page); 93 unsigned long pfn = page_to_pfn(page);
113 struct mem_section *section = __pfn_to_section(pfn); 94 struct mem_section *section = __pfn_to_section(pfn);
114 95#ifdef CONFIG_DEBUG_VM
96 /*
97 * The sanity checks the page allocator does upon freeing a
98 * page can reach here before the page_cgroup arrays are
99 * allocated when feeding a range of pages to the allocator
100 * for the first time during bootup or memory hotplug.
101 */
115 if (!section->page_cgroup) 102 if (!section->page_cgroup)
116 return NULL; 103 return NULL;
104#endif
117 return section->page_cgroup + pfn; 105 return section->page_cgroup + pfn;
118} 106}
119 107
120struct page *lookup_cgroup_page(struct page_cgroup *pc)
121{
122 struct mem_section *section;
123 struct page *page;
124 unsigned long nr;
125
126 nr = page_cgroup_array_id(pc);
127 section = __nr_to_section(nr);
128 page = pfn_to_page(pc - section->page_cgroup);
129 VM_BUG_ON(pc != lookup_page_cgroup(page));
130 return page;
131}
132
133static void *__meminit alloc_page_cgroup(size_t size, int nid) 108static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 109{
110 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
135 void *addr = NULL; 111 void *addr = NULL;
136 gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
137 112
138 addr = alloc_pages_exact_nid(nid, size, flags); 113 addr = alloc_pages_exact_nid(nid, size, flags);
139 if (addr) { 114 if (addr) {
@@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid)
142 } 117 }
143 118
144 if (node_state(nid, N_HIGH_MEMORY)) 119 if (node_state(nid, N_HIGH_MEMORY))
145 addr = vmalloc_node(size, nid); 120 addr = vzalloc_node(size, nid);
146 else 121 else
147 addr = vmalloc(size); 122 addr = vzalloc(size);
148 123
149 return addr; 124 return addr;
150} 125}
151 126
152#ifdef CONFIG_MEMORY_HOTPLUG
153static void free_page_cgroup(void *addr)
154{
155 if (is_vmalloc_addr(addr)) {
156 vfree(addr);
157 } else {
158 struct page *page = virt_to_page(addr);
159 size_t table_size =
160 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
161
162 BUG_ON(PageReserved(page));
163 free_pages_exact(addr, table_size);
164 }
165}
166#endif
167
168static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) 127static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
169{ 128{
170 struct page_cgroup *base, *pc;
171 struct mem_section *section; 129 struct mem_section *section;
130 struct page_cgroup *base;
172 unsigned long table_size; 131 unsigned long table_size;
173 unsigned long nr;
174 int index;
175 132
176 nr = pfn_to_section_nr(pfn); 133 section = __pfn_to_section(pfn);
177 section = __nr_to_section(nr);
178 134
179 if (section->page_cgroup) 135 if (section->page_cgroup)
180 return 0; 136 return 0;
@@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
194 return -ENOMEM; 150 return -ENOMEM;
195 } 151 }
196 152
197 for (index = 0; index < PAGES_PER_SECTION; index++) {
198 pc = base + index;
199 init_page_cgroup(pc, nr);
200 }
201 /* 153 /*
202 * The passed "pfn" may not be aligned to SECTION. For the calculation 154 * The passed "pfn" may not be aligned to SECTION. For the calculation
203 * we need to apply a mask. 155 * we need to apply a mask.
@@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
208 return 0; 160 return 0;
209} 161}
210#ifdef CONFIG_MEMORY_HOTPLUG 162#ifdef CONFIG_MEMORY_HOTPLUG
163static void free_page_cgroup(void *addr)
164{
165 if (is_vmalloc_addr(addr)) {
166 vfree(addr);
167 } else {
168 struct page *page = virt_to_page(addr);
169 size_t table_size =
170 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
171
172 BUG_ON(PageReserved(page));
173 free_pages_exact(addr, table_size);
174 }
175}
176
211void __free_page_cgroup(unsigned long pfn) 177void __free_page_cgroup(unsigned long pfn)
212{ 178{
213 struct mem_section *ms; 179 struct mem_section *ms;
@@ -366,7 +332,6 @@ struct swap_cgroup {
366 unsigned short id; 332 unsigned short id;
367}; 333};
368#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 334#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
369#define SC_POS_MASK (SC_PER_PAGE - 1)
370 335
371/* 336/*
372 * SwapCgroup implements "lookup" and "exchange" operations. 337 * SwapCgroup implements "lookup" and "exchange" operations.
@@ -408,6 +373,21 @@ not_enough_page:
408 return -ENOMEM; 373 return -ENOMEM;
409} 374}
410 375
376static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
377 struct swap_cgroup_ctrl **ctrlp)
378{
379 pgoff_t offset = swp_offset(ent);
380 struct swap_cgroup_ctrl *ctrl;
381 struct page *mappage;
382
383 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
384 if (ctrlp)
385 *ctrlp = ctrl;
386
387 mappage = ctrl->map[offset / SC_PER_PAGE];
388 return page_address(mappage) + offset % SC_PER_PAGE;
389}
390
411/** 391/**
412 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 392 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
413 * @ent: swap entry to be cmpxchged 393 * @ent: swap entry to be cmpxchged
@@ -420,21 +400,13 @@ not_enough_page:
420unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 400unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
421 unsigned short old, unsigned short new) 401 unsigned short old, unsigned short new)
422{ 402{
423 int type = swp_type(ent);
424 unsigned long offset = swp_offset(ent);
425 unsigned long idx = offset / SC_PER_PAGE;
426 unsigned long pos = offset & SC_POS_MASK;
427 struct swap_cgroup_ctrl *ctrl; 403 struct swap_cgroup_ctrl *ctrl;
428 struct page *mappage;
429 struct swap_cgroup *sc; 404 struct swap_cgroup *sc;
430 unsigned long flags; 405 unsigned long flags;
431 unsigned short retval; 406 unsigned short retval;
432 407
433 ctrl = &swap_cgroup_ctrl[type]; 408 sc = lookup_swap_cgroup(ent, &ctrl);
434 409
435 mappage = ctrl->map[idx];
436 sc = page_address(mappage);
437 sc += pos;
438 spin_lock_irqsave(&ctrl->lock, flags); 410 spin_lock_irqsave(&ctrl->lock, flags);
439 retval = sc->id; 411 retval = sc->id;
440 if (retval == old) 412 if (retval == old)
@@ -455,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
455 */ 427 */
456unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) 428unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
457{ 429{
458 int type = swp_type(ent);
459 unsigned long offset = swp_offset(ent);
460 unsigned long idx = offset / SC_PER_PAGE;
461 unsigned long pos = offset & SC_POS_MASK;
462 struct swap_cgroup_ctrl *ctrl; 430 struct swap_cgroup_ctrl *ctrl;
463 struct page *mappage;
464 struct swap_cgroup *sc; 431 struct swap_cgroup *sc;
465 unsigned short old; 432 unsigned short old;
466 unsigned long flags; 433 unsigned long flags;
467 434
468 ctrl = &swap_cgroup_ctrl[type]; 435 sc = lookup_swap_cgroup(ent, &ctrl);
469 436
470 mappage = ctrl->map[idx];
471 sc = page_address(mappage);
472 sc += pos;
473 spin_lock_irqsave(&ctrl->lock, flags); 437 spin_lock_irqsave(&ctrl->lock, flags);
474 old = sc->id; 438 old = sc->id;
475 sc->id = id; 439 sc->id = id;
@@ -479,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
479} 443}
480 444
481/** 445/**
482 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 446 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
483 * @ent: swap entry to be looked up. 447 * @ent: swap entry to be looked up.
484 * 448 *
485 * Returns CSS ID of mem_cgroup on success. 0 on failure. (0 is an invalid ID) 449 * Returns CSS ID of mem_cgroup on success. 0 on failure. (0 is an invalid ID)
486 */ 450 */
487unsigned short lookup_swap_cgroup(swp_entry_t ent) 451unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
488{ 452{
489 int type = swp_type(ent); 453 return lookup_swap_cgroup(ent, NULL)->id;
490 unsigned long offset = swp_offset(ent);
491 unsigned long idx = offset / SC_PER_PAGE;
492 unsigned long pos = offset & SC_POS_MASK;
493 struct swap_cgroup_ctrl *ctrl;
494 struct page *mappage;
495 struct swap_cgroup *sc;
496 unsigned short ret;
497
498 ctrl = &swap_cgroup_ctrl[type];
499 mappage = ctrl->map[idx];
500 sc = page_address(mappage);
501 sc += pos;
502 ret = sc->id;
503 return ret;
504} 454}
505 455
506int swap_cgroup_swapon(int type, unsigned long max_pages) 456int swap_cgroup_swapon(int type, unsigned long max_pages)
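
The mm/page_cgroup.c changes drop the per-entry init loops (the arrays are now allocated zeroed via __GFP_ZERO and vzalloc), remove lookup_cgroup_page() and SC_POS_MASK, and fold three copies of the swap_cgroup indexing into one lookup_swap_cgroup() helper: the swap offset selects a backing page with offset / SC_PER_PAGE and a slot inside it with offset % SC_PER_PAGE. The standalone sketch below reproduces only that indexing; the two-level array, the 4096-byte page size and the demo offset are assumptions made for the example, not kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE   4096u
struct swap_cgroup { unsigned short id; };
#define SC_PER_PAGE (PAGE_SIZE / sizeof(struct swap_cgroup))

static struct swap_cgroup *lookup(struct swap_cgroup **map, size_t offset)
{
        /* map[offset / SC_PER_PAGE] selects the backing page,
         * offset % SC_PER_PAGE selects the record inside it - the same
         * arithmetic the helper performs via page_address(). */
        return &map[offset / SC_PER_PAGE][offset % SC_PER_PAGE];
}

int main(void)
{
        size_t npages = 4, i;
        struct swap_cgroup **map = calloc(npages, sizeof(*map));

        if (!map)
                return 1;
        for (i = 0; i < npages; i++) {
                map[i] = calloc(SC_PER_PAGE, sizeof(struct swap_cgroup));
                if (!map[i])
                        return 1;
        }

        lookup(map, 3000)->id = 42;             /* record an owner id */
        printf("id at offset 3000: %d\n", lookup(map, 3000)->id);

        for (i = 0; i < npages; i++)
                free(map[i]);
        free(map);
        return 0;
}
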
diff --git a/mm/rmap.c b/mm/rmap.c
index a2e5ce1fa081..c8454e06b6c8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -773,7 +773,7 @@ out:
773} 773}
774 774
775static int page_referenced_anon(struct page *page, 775static int page_referenced_anon(struct page *page,
776 struct mem_cgroup *mem_cont, 776 struct mem_cgroup *memcg,
777 unsigned long *vm_flags) 777 unsigned long *vm_flags)
778{ 778{
779 unsigned int mapcount; 779 unsigned int mapcount;
@@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page,
796 * counting on behalf of references from different 796 * counting on behalf of references from different
797 * cgroups 797 * cgroups
798 */ 798 */
799 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 799 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
800 continue; 800 continue;
801 referenced += page_referenced_one(page, vma, address, 801 referenced += page_referenced_one(page, vma, address,
802 &mapcount, vm_flags); 802 &mapcount, vm_flags);
@@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page,
811/** 811/**
812 * page_referenced_file - referenced check for object-based rmap 812 * page_referenced_file - referenced check for object-based rmap
813 * @page: the page we're checking references on. 813 * @page: the page we're checking references on.
814 * @mem_cont: target memory controller 814 * @memcg: target memory control group
815 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 815 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
816 * 816 *
817 * For an object-based mapped page, find all the places it is mapped and 817 * For an object-based mapped page, find all the places it is mapped and
@@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page,
822 * This function is only called from page_referenced for object-based pages. 822 * This function is only called from page_referenced for object-based pages.
823 */ 823 */
824static int page_referenced_file(struct page *page, 824static int page_referenced_file(struct page *page,
825 struct mem_cgroup *mem_cont, 825 struct mem_cgroup *memcg,
826 unsigned long *vm_flags) 826 unsigned long *vm_flags)
827{ 827{
828 unsigned int mapcount; 828 unsigned int mapcount;
@@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page,
864 * counting on behalf of references from different 864 * counting on behalf of references from different
865 * cgroups 865 * cgroups
866 */ 866 */
867 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 867 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
868 continue; 868 continue;
869 referenced += page_referenced_one(page, vma, address, 869 referenced += page_referenced_one(page, vma, address,
870 &mapcount, vm_flags); 870 &mapcount, vm_flags);
@@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page,
880 * page_referenced - test if the page was referenced 880 * page_referenced - test if the page was referenced
881 * @page: the page to test 881 * @page: the page to test
882 * @is_locked: caller holds lock on the page 882 * @is_locked: caller holds lock on the page
883 * @mem_cont: target memory controller 883 * @memcg: target memory cgroup
884 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 884 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
885 * 885 *
886 * Quick test_and_clear_referenced for all mappings to a page, 886 * Quick test_and_clear_referenced for all mappings to a page,
@@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page,
888 */ 888 */
889int page_referenced(struct page *page, 889int page_referenced(struct page *page,
890 int is_locked, 890 int is_locked,
891 struct mem_cgroup *mem_cont, 891 struct mem_cgroup *memcg,
892 unsigned long *vm_flags) 892 unsigned long *vm_flags)
893{ 893{
894 int referenced = 0; 894 int referenced = 0;
@@ -904,13 +904,13 @@ int page_referenced(struct page *page,
904 } 904 }
905 } 905 }
906 if (unlikely(PageKsm(page))) 906 if (unlikely(PageKsm(page)))
907 referenced += page_referenced_ksm(page, mem_cont, 907 referenced += page_referenced_ksm(page, memcg,
908 vm_flags); 908 vm_flags);
909 else if (PageAnon(page)) 909 else if (PageAnon(page))
910 referenced += page_referenced_anon(page, mem_cont, 910 referenced += page_referenced_anon(page, memcg,
911 vm_flags); 911 vm_flags);
912 else if (page->mapping) 912 else if (page->mapping)
913 referenced += page_referenced_file(page, mem_cont, 913 referenced += page_referenced_file(page, memcg,
914 vm_flags); 914 vm_flags);
915 if (we_locked) 915 if (we_locked)
916 unlock_page(page); 916 unlock_page(page);
diff --git a/mm/slub.c b/mm/slub.c
index 5d37b5e44140..4907563ef7ff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
366 const char *n) 366 const char *n)
367{ 367{
368 VM_BUG_ON(!irqs_disabled()); 368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE 369#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
370 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
370 if (s->flags & __CMPXCHG_DOUBLE) { 371 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist, &page->counters, 372 if (cmpxchg_double(&page->freelist, &page->counters,
372 freelist_old, counters_old, 373 freelist_old, counters_old,
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
400 void *freelist_new, unsigned long counters_new, 401 void *freelist_new, unsigned long counters_new,
401 const char *n) 402 const char *n)
402{ 403{
403#ifdef CONFIG_CMPXCHG_DOUBLE 404#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
405 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
404 if (s->flags & __CMPXCHG_DOUBLE) { 406 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist, &page->counters, 407 if (cmpxchg_double(&page->freelist, &page->counters,
406 freelist_old, counters_old, 408 freelist_old, counters_old,
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
3014 } 3016 }
3015 } 3017 }
3016 3018
3017#ifdef CONFIG_CMPXCHG_DOUBLE 3019#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3020 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3018 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3021 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3019 /* Enable fast mode */ 3022 /* Enable fast mode */
3020 s->flags |= __CMPXCHG_DOUBLE; 3023 s->flags |= __CMPXCHG_DOUBLE;
diff --git a/mm/swap.c b/mm/swap.c
index 67a09a633a09..b0f529b38979 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/percpu_counter.h> 26#include <linux/percpu_counter.h>
28#include <linux/percpu.h> 27#include <linux/percpu.h>
29#include <linux/cpu.h> 28#include <linux/cpu.h>
@@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page)
54 spin_lock_irqsave(&zone->lru_lock, flags); 53 spin_lock_irqsave(&zone->lru_lock, flags);
55 VM_BUG_ON(!PageLRU(page)); 54 VM_BUG_ON(!PageLRU(page));
56 __ClearPageLRU(page); 55 __ClearPageLRU(page);
57 del_page_from_lru(zone, page); 56 del_page_from_lru_list(zone, page, page_off_lru(page));
58 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
59 } 58 }
60} 59}
@@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
232static void pagevec_move_tail_fn(struct page *page, void *arg) 231static void pagevec_move_tail_fn(struct page *page, void *arg)
233{ 232{
234 int *pgmoved = arg; 233 int *pgmoved = arg;
235 struct zone *zone = page_zone(page);
236 234
237 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 235 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
238 enum lru_list lru = page_lru_base_type(page); 236 enum lru_list lru = page_lru_base_type(page);
239 list_move_tail(&page->lru, &zone->lru[lru].list); 237 struct lruvec *lruvec;
240 mem_cgroup_rotate_reclaimable_page(page); 238
239 lruvec = mem_cgroup_lru_move_lists(page_zone(page),
240 page, lru, lru);
241 list_move_tail(&page->lru, &lruvec->lists[lru]);
241 (*pgmoved)++; 242 (*pgmoved)++;
242 } 243 }
243} 244}
@@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page)
368 SetPageReferenced(page); 369 SetPageReferenced(page);
369 } 370 }
370} 371}
371
372EXPORT_SYMBOL(mark_page_accessed); 372EXPORT_SYMBOL(mark_page_accessed);
373 373
374void __lru_cache_add(struct page *page, enum lru_list lru) 374void __lru_cache_add(struct page *page, enum lru_list lru)
@@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
377 377
378 page_cache_get(page); 378 page_cache_get(page);
379 if (!pagevec_add(pvec, page)) 379 if (!pagevec_add(pvec, page))
380 ____pagevec_lru_add(pvec, lru); 380 __pagevec_lru_add(pvec, lru);
381 put_cpu_var(lru_add_pvecs); 381 put_cpu_var(lru_add_pvecs);
382} 382}
383EXPORT_SYMBOL(__lru_cache_add); 383EXPORT_SYMBOL(__lru_cache_add);
@@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
476 */ 476 */
477 SetPageReclaim(page); 477 SetPageReclaim(page);
478 } else { 478 } else {
479 struct lruvec *lruvec;
479 /* 480 /*
480 * The page's writeback ended while it was in the pagevec; 481 * The page's writeback ended while it was in the pagevec;
481 * we move the page to the tail of the inactive list. 482 * we move the page to the tail of the inactive list.
482 */ 483 */
483 list_move_tail(&page->lru, &zone->lru[lru].list); 484 lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
484 mem_cgroup_rotate_reclaimable_page(page); 485 list_move_tail(&page->lru, &lruvec->lists[lru]);
485 __count_vm_event(PGROTATED); 486 __count_vm_event(PGROTATED);
486 } 487 }
487 488
@@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu)
504 for_each_lru(lru) { 505 for_each_lru(lru) {
505 pvec = &pvecs[lru - LRU_BASE]; 506 pvec = &pvecs[lru - LRU_BASE];
506 if (pagevec_count(pvec)) 507 if (pagevec_count(pvec))
507 ____pagevec_lru_add(pvec, lru); 508 __pagevec_lru_add(pvec, lru);
508 } 509 }
509 510
510 pvec = &per_cpu(lru_rotate_pvecs, cpu); 511 pvec = &per_cpu(lru_rotate_pvecs, cpu);
@@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold)
616 } 617 }
617 VM_BUG_ON(!PageLRU(page)); 618 VM_BUG_ON(!PageLRU(page));
618 __ClearPageLRU(page); 619 __ClearPageLRU(page);
619 del_page_from_lru(zone, page); 620 del_page_from_lru_list(zone, page, page_off_lru(page));
620 } 621 }
621 622
622 list_add(&page->lru, &pages_to_free); 623 list_add(&page->lru, &pages_to_free);
@@ -644,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec)
644 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 645 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
645 pagevec_reinit(pvec); 646 pagevec_reinit(pvec);
646} 647}
647
648EXPORT_SYMBOL(__pagevec_release); 648EXPORT_SYMBOL(__pagevec_release);
649 649
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE
650/* used by __split_huge_page_refcount() */ 651/* used by __split_huge_page_refcount() */
651void lru_add_page_tail(struct zone* zone, 652void lru_add_page_tail(struct zone* zone,
652 struct page *page, struct page *page_tail) 653 struct page *page, struct page *page_tail)
@@ -654,7 +655,6 @@ void lru_add_page_tail(struct zone* zone,
654 int active; 655 int active;
655 enum lru_list lru; 656 enum lru_list lru;
656 const int file = 0; 657 const int file = 0;
657 struct list_head *head;
658 658
659 VM_BUG_ON(!PageHead(page)); 659 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 660 VM_BUG_ON(PageCompound(page_tail));
@@ -673,18 +673,30 @@ void lru_add_page_tail(struct zone* zone,
673 lru = LRU_INACTIVE_ANON; 673 lru = LRU_INACTIVE_ANON;
674 } 674 }
675 update_page_reclaim_stat(zone, page_tail, file, active); 675 update_page_reclaim_stat(zone, page_tail, file, active);
676 if (likely(PageLRU(page)))
677 head = page->lru.prev;
678 else
679 head = &zone->lru[lru].list;
680 __add_page_to_lru_list(zone, page_tail, lru, head);
681 } else { 676 } else {
682 SetPageUnevictable(page_tail); 677 SetPageUnevictable(page_tail);
683 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); 678 lru = LRU_UNEVICTABLE;
679 }
680
681 if (likely(PageLRU(page)))
682 list_add_tail(&page_tail->lru, &page->lru);
683 else {
684 struct list_head *list_head;
685 /*
686 * Head page has not yet been counted, as an hpage,
687 * so we must account for each subpage individually.
688 *
689 * Use the standard add function to put page_tail on the list,
690 * but then correct its position so they all end up in order.
691 */
692 add_page_to_lru_list(zone, page_tail, lru);
693 list_head = page_tail->lru.prev;
694 list_move_tail(&page_tail->lru, list_head);
684 } 695 }
685} 696}
697#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
686 698
687static void ____pagevec_lru_add_fn(struct page *page, void *arg) 699static void __pagevec_lru_add_fn(struct page *page, void *arg)
688{ 700{
689 enum lru_list lru = (enum lru_list)arg; 701 enum lru_list lru = (enum lru_list)arg;
690 struct zone *zone = page_zone(page); 702 struct zone *zone = page_zone(page);
@@ -706,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg)
706 * Add the passed pages to the LRU, then drop the caller's refcount 718 * Add the passed pages to the LRU, then drop the caller's refcount
707 * on them. Reinitialises the caller's pagevec. 719 * on them. Reinitialises the caller's pagevec.
708 */ 720 */
709void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 721void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
710{ 722{
711 VM_BUG_ON(is_unevictable_lru(lru)); 723 VM_BUG_ON(is_unevictable_lru(lru));
712 724
713 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); 725 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
714}
715
716EXPORT_SYMBOL(____pagevec_lru_add);
717
718/*
719 * Try to drop buffers from the pages in a pagevec
720 */
721void pagevec_strip(struct pagevec *pvec)
722{
723 int i;
724
725 for (i = 0; i < pagevec_count(pvec); i++) {
726 struct page *page = pvec->pages[i];
727
728 if (page_has_private(page) && trylock_page(page)) {
729 if (page_has_private(page))
730 try_to_release_page(page, 0);
731 unlock_page(page);
732 }
733 }
734} 726}
727EXPORT_SYMBOL(__pagevec_lru_add);
735 728
736/** 729/**
737 * pagevec_lookup - gang pagecache lookup 730 * pagevec_lookup - gang pagecache lookup
@@ -755,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
755 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 748 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
756 return pagevec_count(pvec); 749 return pagevec_count(pvec);
757} 750}
758
759EXPORT_SYMBOL(pagevec_lookup); 751EXPORT_SYMBOL(pagevec_lookup);
760 752
761unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 753unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
@@ -765,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
765 nr_pages, pvec->pages); 757 nr_pages, pvec->pages);
766 return pagevec_count(pvec); 758 return pagevec_count(pvec);
767} 759}
768
769EXPORT_SYMBOL(pagevec_lookup_tag); 760EXPORT_SYMBOL(pagevec_lookup_tag);
770 761
771/* 762/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..470038a91873 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
300 new_page = alloc_page_vma(gfp_mask, vma, addr); 300 new_page = alloc_page_vma(gfp_mask, vma, addr);
301 if (!new_page) 301 if (!new_page)
302 break; /* Out of memory */ 302 break; /* Out of memory */
303 /*
304 * The memcg-specific accounting when moving
305 * pages around the LRU lists relies on the
306 * page's owner (memcg) to be valid. Usually,
307 * pages are assigned to a new owner before
308 * being put on the LRU list, but since this
309 * is not the case here, the stale owner from
310 * a previous allocation cycle must be reset.
311 */
312 mem_cgroup_reset_owner(new_page);
303 } 313 }
304 314
305 /* 315 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9520592d4231..d999f090dfda 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free)
847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
848 unsigned long addr, swp_entry_t entry, struct page *page) 848 unsigned long addr, swp_entry_t entry, struct page *page)
849{ 849{
850 struct mem_cgroup *ptr; 850 struct mem_cgroup *memcg;
851 spinlock_t *ptl; 851 spinlock_t *ptl;
852 pte_t *pte; 852 pte_t *pte;
853 int ret = 1; 853 int ret = 1;
854 854
855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
856 GFP_KERNEL, &memcg)) {
856 ret = -ENOMEM; 857 ret = -ENOMEM;
857 goto out_nolock; 858 goto out_nolock;
858 } 859 }
@@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
860 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
861 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
862 if (ret > 0) 863 if (ret > 0)
863 mem_cgroup_cancel_charge_swapin(ptr); 864 mem_cgroup_cancel_charge_swapin(memcg);
864 ret = 0; 865 ret = 0;
865 goto out; 866 goto out;
866 } 867 }
@@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
871 set_pte_at(vma->vm_mm, addr, pte, 872 set_pte_at(vma->vm_mm, addr, pte,
872 pte_mkold(mk_pte(page, vma->vm_page_prot))); 873 pte_mkold(mk_pte(page, vma->vm_page_prot)));
873 page_add_anon_rmap(page, vma, addr); 874 page_add_anon_rmap(page, vma, addr);
874 mem_cgroup_commit_charge_swapin(page, ptr); 875 mem_cgroup_commit_charge_swapin(page, memcg);
875 swap_free(entry); 876 swap_free(entry);
876 /* 877 /*
877 * Move the page to the active list so it is not 878 * Move the page to the active list so it is not
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 877ca046f43d..86ce9a526c17 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2380 if (!vas || !vms) 2380 if (!vas || !vms)
2381 goto err_free; 2381 goto err_free2;
2382 2382
2383 for (area = 0; area < nr_vms; area++) { 2383 for (area = 0; area < nr_vms; area++) {
2384 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); 2384 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
@@ -2476,11 +2476,10 @@ found:
2476 2476
2477err_free: 2477err_free:
2478 for (area = 0; area < nr_vms; area++) { 2478 for (area = 0; area < nr_vms; area++) {
2479 if (vas) 2479 kfree(vas[area]);
2480 kfree(vas[area]); 2480 kfree(vms[area]);
2481 if (vms)
2482 kfree(vms[area]);
2483 } 2481 }
2482err_free2:
2484 kfree(vas); 2483 kfree(vas);
2485 kfree(vms); 2484 kfree(vms);
2486 return NULL; 2485 return NULL;
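
The mm/vmalloc.c hunk fixes the error path of pcpu_get_vm_areas(): when the top-level kzalloc() of vas/vms fails, the old code jumped to a label that walked those arrays, guarding each iteration with "if (vas)" / "if (vms)"; the fix adds a second label so the per-element loop only runs once both outer arrays are known to exist. The function below is a generic, self-contained sketch of that two-label cleanup pattern; setup(), the element count and the int payloads are invented for the example and are not the kernel data structures.

#include <stdlib.h>

static int setup(int n)
{
        int i;
        int **vas = calloc(n, sizeof(*vas));
        int **vms = calloc(n, sizeof(*vms));

        if (!vas || !vms)
                goto err_free2;         /* no per-element allocations yet */

        for (i = 0; i < n; i++) {
                vas[i] = malloc(sizeof(int));
                vms[i] = malloc(sizeof(int));
                if (!vas[i] || !vms[i])
                        goto err_free;
        }

        /* The real function hands the arrays to its caller here; the demo
         * just tears everything down again and reports success. */
        for (i = 0; i < n; i++) {
                free(vas[i]);
                free(vms[i]);
        }
        free(vas);
        free(vms);
        return 0;

err_free:
        for (i = 0; i < n; i++) {       /* safe: both outer arrays exist and
                                           unused slots are still NULL */
                free(vas[i]);
                free(vms[i]);
        }
err_free2:
        free(vas);
        free(vms);
        return -1;
}

int main(void)
{
        return setup(8) ? 1 : 0;
}
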
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26f4a8a4e0c7..2880396f7953 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -103,8 +103,11 @@ struct scan_control {
103 */ 103 */
104 reclaim_mode_t reclaim_mode; 104 reclaim_mode_t reclaim_mode;
105 105
106 /* Which cgroup do we reclaim from */ 106 /*
107 struct mem_cgroup *mem_cgroup; 107 * The memory cgroup that hit its limit and as a result is the
108 * primary target of this reclaim invocation.
109 */
110 struct mem_cgroup *target_mem_cgroup;
108 111
109 /* 112 /*
110 * Nodemask of nodes allowed by the caller. If NULL, all nodes 113 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +116,11 @@ struct scan_control {
113 nodemask_t *nodemask; 116 nodemask_t *nodemask;
114}; 117};
115 118
119struct mem_cgroup_zone {
120 struct mem_cgroup *mem_cgroup;
121 struct zone *zone;
122};
123
116#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 124#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
117 125
118#ifdef ARCH_HAS_PREFETCH 126#ifdef ARCH_HAS_PREFETCH
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list);
153static DECLARE_RWSEM(shrinker_rwsem); 161static DECLARE_RWSEM(shrinker_rwsem);
154 162
155#ifdef CONFIG_CGROUP_MEM_RES_CTLR 163#ifdef CONFIG_CGROUP_MEM_RES_CTLR
156#define scanning_global_lru(sc) (!(sc)->mem_cgroup) 164static bool global_reclaim(struct scan_control *sc)
165{
166 return !sc->target_mem_cgroup;
167}
168
169static bool scanning_global_lru(struct mem_cgroup_zone *mz)
170{
171 return !mz->mem_cgroup;
172}
157#else 173#else
158#define scanning_global_lru(sc) (1) 174static bool global_reclaim(struct scan_control *sc)
175{
176 return true;
177}
178
179static bool scanning_global_lru(struct mem_cgroup_zone *mz)
180{
181 return true;
182}
159#endif 183#endif
160 184
161static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, 185static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
162 struct scan_control *sc)
163{ 186{
164 if (!scanning_global_lru(sc)) 187 if (!scanning_global_lru(mz))
165 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); 188 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
166 189
167 return &zone->reclaim_stat; 190 return &mz->zone->reclaim_stat;
168} 191}
169 192
170static unsigned long zone_nr_lru_pages(struct zone *zone, 193static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
171 struct scan_control *sc, enum lru_list lru) 194 enum lru_list lru)
172{ 195{
173 if (!scanning_global_lru(sc)) 196 if (!scanning_global_lru(mz))
174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, 197 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru)); 198 zone_to_nid(mz->zone),
199 zone_idx(mz->zone),
200 BIT(lru));
176 201
177 return zone_page_state(zone, NR_LRU_BASE + lru); 202 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
178} 203}
179 204
180 205
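
The scan_control / mem_cgroup_zone split above separates two questions that the old scanning_global_lru(sc) macro conflated: global_reclaim(sc) asks whether the reclaim pass as a whole was triggered outside any memcg (sc->target_mem_cgroup is NULL), while scanning_global_lru(mz) asks whether the particular zone/LRU walk currently underway belongs to a memcg. A trimmed, standalone rendering of the two predicates follows; the struct bodies are reduced to the fields the predicates touch, and with the memcg controller compiled out both simply return true, as in the #else branch above.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mem_cgroup;                      /* opaque, as in the kernel */

struct scan_control {
        struct mem_cgroup *target_mem_cgroup;
};

struct mem_cgroup_zone {
        struct mem_cgroup *mem_cgroup;
        /* struct zone *zone; omitted in this sketch */
};

static bool global_reclaim(const struct scan_control *sc)
{
        return !sc->target_mem_cgroup;
}

static bool scanning_global_lru(const struct mem_cgroup_zone *mz)
{
        return !mz->mem_cgroup;
}

int main(void)
{
        struct scan_control sc = { .target_mem_cgroup = NULL };
        struct mem_cgroup_zone mz = { .mem_cgroup = NULL };

        printf("global reclaim: %d, global lru walk: %d\n",
               global_reclaim(&sc), scanning_global_lru(&mz));
        return 0;
}
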
@@ -677,12 +702,13 @@ enum page_references {
677}; 702};
678 703
679static enum page_references page_check_references(struct page *page, 704static enum page_references page_check_references(struct page *page,
705 struct mem_cgroup_zone *mz,
680 struct scan_control *sc) 706 struct scan_control *sc)
681{ 707{
682 int referenced_ptes, referenced_page; 708 int referenced_ptes, referenced_page;
683 unsigned long vm_flags; 709 unsigned long vm_flags;
684 710
685 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); 711 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
686 referenced_page = TestClearPageReferenced(page); 712 referenced_page = TestClearPageReferenced(page);
687 713
688 /* Lumpy reclaim - ignore references */ 714 /* Lumpy reclaim - ignore references */
@@ -738,7 +764,7 @@ static enum page_references page_check_references(struct page *page,
738 * shrink_page_list() returns the number of reclaimed pages 764 * shrink_page_list() returns the number of reclaimed pages
739 */ 765 */
740static unsigned long shrink_page_list(struct list_head *page_list, 766static unsigned long shrink_page_list(struct list_head *page_list,
741 struct zone *zone, 767 struct mem_cgroup_zone *mz,
742 struct scan_control *sc, 768 struct scan_control *sc,
743 int priority, 769 int priority,
744 unsigned long *ret_nr_dirty, 770 unsigned long *ret_nr_dirty,
@@ -769,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
769 goto keep; 795 goto keep;
770 796
771 VM_BUG_ON(PageActive(page)); 797 VM_BUG_ON(PageActive(page));
772 VM_BUG_ON(page_zone(page) != zone); 798 VM_BUG_ON(page_zone(page) != mz->zone);
773 799
774 sc->nr_scanned++; 800 sc->nr_scanned++;
775 801
@@ -803,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
803 } 829 }
804 } 830 }
805 831
806 references = page_check_references(page, sc); 832 references = page_check_references(page, mz, sc);
807 switch (references) { 833 switch (references) {
808 case PAGEREF_ACTIVATE: 834 case PAGEREF_ACTIVATE:
809 goto activate_locked; 835 goto activate_locked;
@@ -994,8 +1020,8 @@ keep_lumpy:
994 * back off and wait for congestion to clear because further reclaim 1020 * back off and wait for congestion to clear because further reclaim
995 * will encounter the same problem 1021 * will encounter the same problem
996 */ 1022 */
997 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) 1023 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
998 zone_set_flag(zone, ZONE_CONGESTED); 1024 zone_set_flag(mz->zone, ZONE_CONGESTED);
999 1025
1000 free_hot_cold_page_list(&free_pages, 1); 1026 free_hot_cold_page_list(&free_pages, 1);
1001 1027
@@ -1049,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1049 1075
1050 ret = -EBUSY; 1076 ret = -EBUSY;
1051 1077
1052 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) 1078 /*
1053 return ret; 1079 * To minimise LRU disruption, the caller can indicate that it only
1080 * wants to isolate pages it will be able to operate on without
1081 * blocking - clean pages for the most part.
1082 *
1083 * ISOLATE_CLEAN means that only clean pages should be isolated. This
1084 * is used by reclaim when it cannot write to backing storage
1085 *
1086 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1087 * that it is possible to migrate without blocking
1088 */
1089 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1090 /* All the caller can do on PageWriteback is block */
1091 if (PageWriteback(page))
1092 return ret;
1093
1094 if (PageDirty(page)) {
1095 struct address_space *mapping;
1096
1097 /* ISOLATE_CLEAN means only clean pages */
1098 if (mode & ISOLATE_CLEAN)
1099 return ret;
1100
1101 /*
1102 * Only pages without mappings or that have a
1103 * ->migratepage callback are possible to migrate
1104 * without blocking
1105 */
1106 mapping = page_mapping(page);
1107 if (mapping && !mapping->a_ops->migratepage)
1108 return ret;
1109 }
1110 }
1054 1111
1055 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) 1112 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1056 return ret; 1113 return ret;
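The expanded check above encodes a small decision table: writeback pages are never isolated by callers that cannot block, dirty pages are skipped entirely under ISOLATE_CLEAN, and under ISOLATE_ASYNC_MIGRATE a dirty page is taken only if it has no mapping or a ->migratepage callback. A compilable sketch of that table, with hypothetical stand-in flags and page state:

#include <stdbool.h>

/* Hypothetical stand-ins for the page state the real check inspects. */
struct page_state {
        bool writeback;
        bool dirty;
        bool has_mapping;
        bool mapping_can_migrate;       /* a_ops->migratepage != NULL */
};

enum {
        MODEL_ISOLATE_CLEAN = 1,
        MODEL_ISOLATE_ASYNC_MIGRATE = 2,
};

/* Returns true if a page in this state may be isolated under 'mode'. */
static bool may_isolate(const struct page_state *p, int mode)
{
        if (mode & (MODEL_ISOLATE_CLEAN | MODEL_ISOLATE_ASYNC_MIGRATE)) {
                /* Writeback can only be waited for, so never take it here. */
                if (p->writeback)
                        return false;

                if (p->dirty) {
                        /* Clean-only callers skip every dirty page. */
                        if (mode & MODEL_ISOLATE_CLEAN)
                                return false;
                        /*
                         * Async migration must not block on writeout, so a
                         * dirty page needs no mapping or a ->migratepage
                         * callback to be taken.
                         */
                        if (p->has_mapping && !p->mapping_can_migrate)
                                return false;
                }
        }
        return true;
}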
@@ -1079,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1079 * Appropriate locks must be held before calling this function. 1136 * Appropriate locks must be held before calling this function.
1080 * 1137 *
1081 * @nr_to_scan: The number of pages to look through on the list. 1138 * @nr_to_scan: The number of pages to look through on the list.
1082 * @src: The LRU list to pull pages off. 1139 * @mz: The mem_cgroup_zone to pull pages from.
1083 * @dst: The temp list to put pages on to. 1140 * @dst: The temp list to put pages on to.
1084 * @scanned: The number of pages that were scanned. 1141 * @nr_scanned: The number of pages that were scanned.
1085 * @order: The caller's attempted allocation order 1142 * @order: The caller's attempted allocation order
1086 * @mode: One of the LRU isolation modes 1143 * @mode: One of the LRU isolation modes
1144 * @active: True [1] if isolating active pages
1087 * @file: True [1] if isolating file [!anon] pages 1145 * @file: True [1] if isolating file [!anon] pages
1088 * 1146 *
1089 * returns how many pages were moved onto *@dst. 1147 * returns how many pages were moved onto *@dst.
1090 */ 1148 */
1091static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1149static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1092 struct list_head *src, struct list_head *dst, 1150 struct mem_cgroup_zone *mz, struct list_head *dst,
1093 unsigned long *scanned, int order, isolate_mode_t mode, 1151 unsigned long *nr_scanned, int order, isolate_mode_t mode,
1094 int file) 1152 int active, int file)
1095{ 1153{
1154 struct lruvec *lruvec;
1155 struct list_head *src;
1096 unsigned long nr_taken = 0; 1156 unsigned long nr_taken = 0;
1097 unsigned long nr_lumpy_taken = 0; 1157 unsigned long nr_lumpy_taken = 0;
1098 unsigned long nr_lumpy_dirty = 0; 1158 unsigned long nr_lumpy_dirty = 0;
1099 unsigned long nr_lumpy_failed = 0; 1159 unsigned long nr_lumpy_failed = 0;
1100 unsigned long scan; 1160 unsigned long scan;
1161 int lru = LRU_BASE;
1162
1163 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1164 if (active)
1165 lru += LRU_ACTIVE;
1166 if (file)
1167 lru += LRU_FILE;
1168 src = &lruvec->lists[lru];
1101 1169
1102 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1170 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1103 struct page *page; 1171 struct page *page;
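isolate_lru_pages() now picks the source list itself from the (active, file) pair instead of having isolate_pages_global() do it, using the additive LRU_BASE/LRU_ACTIVE/LRU_FILE offsets. A small standalone example of that index arithmetic; the enum ordering here mirrors mainline's enum lru_list:

#include <stdio.h>

/* Same additive layout as the kernel's enum lru_list. */
enum lru_list {
        LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
        LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
        LRU_UNEVICTABLE, NR_LRU_LISTS
};
#define LRU_BASE        0
#define LRU_ACTIVE      1
#define LRU_FILE        2

/* isolate_lru_pages() now derives the list itself from (active, file). */
static int pick_lru(int active, int file)
{
        int lru = LRU_BASE;

        if (active)
                lru += LRU_ACTIVE;
        if (file)
                lru += LRU_FILE;
        return lru;
}

int main(void)
{
        printf("inactive file -> %d (LRU_INACTIVE_FILE)\n", pick_lru(0, 1));
        printf("active anon   -> %d (LRU_ACTIVE_ANON)\n", pick_lru(1, 0));
        return 0;
}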
@@ -1113,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1113 1181
1114 switch (__isolate_lru_page(page, mode, file)) { 1182 switch (__isolate_lru_page(page, mode, file)) {
1115 case 0: 1183 case 0:
1184 mem_cgroup_lru_del(page);
1116 list_move(&page->lru, dst); 1185 list_move(&page->lru, dst);
1117 mem_cgroup_del_lru(page);
1118 nr_taken += hpage_nr_pages(page); 1186 nr_taken += hpage_nr_pages(page);
1119 break; 1187 break;
1120 1188
1121 case -EBUSY: 1189 case -EBUSY:
1122 /* else it is being freed elsewhere */ 1190 /* else it is being freed elsewhere */
1123 list_move(&page->lru, src); 1191 list_move(&page->lru, src);
1124 mem_cgroup_rotate_lru_list(page, page_lru(page));
1125 continue; 1192 continue;
1126 1193
1127 default: 1194 default:
@@ -1171,13 +1238,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1171 break; 1238 break;
1172 1239
1173 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1240 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1241 unsigned int isolated_pages;
1242
1243 mem_cgroup_lru_del(cursor_page);
1174 list_move(&cursor_page->lru, dst); 1244 list_move(&cursor_page->lru, dst);
1175 mem_cgroup_del_lru(cursor_page); 1245 isolated_pages = hpage_nr_pages(cursor_page);
1176 nr_taken += hpage_nr_pages(cursor_page); 1246 nr_taken += isolated_pages;
1177 nr_lumpy_taken++; 1247 nr_lumpy_taken += isolated_pages;
1178 if (PageDirty(cursor_page)) 1248 if (PageDirty(cursor_page))
1179 nr_lumpy_dirty++; 1249 nr_lumpy_dirty += isolated_pages;
1180 scan++; 1250 scan++;
1251 pfn += isolated_pages - 1;
1181 } else { 1252 } else {
1182 /* 1253 /*
1183 * Check if the page is freed already. 1254 * Check if the page is freed already.
@@ -1203,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1203 nr_lumpy_failed++; 1274 nr_lumpy_failed++;
1204 } 1275 }
1205 1276
1206 *scanned = scan; 1277 *nr_scanned = scan;
1207 1278
1208 trace_mm_vmscan_lru_isolate(order, 1279 trace_mm_vmscan_lru_isolate(order,
1209 nr_to_scan, scan, 1280 nr_to_scan, scan,
1210 nr_taken, 1281 nr_taken,
1211 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1282 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1212 mode); 1283 mode, file);
1213 return nr_taken; 1284 return nr_taken;
1214} 1285}
1215 1286
1216static unsigned long isolate_pages_global(unsigned long nr,
1217 struct list_head *dst,
1218 unsigned long *scanned, int order,
1219 isolate_mode_t mode,
1220 struct zone *z, int active, int file)
1221{
1222 int lru = LRU_BASE;
1223 if (active)
1224 lru += LRU_ACTIVE;
1225 if (file)
1226 lru += LRU_FILE;
1227 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1228 mode, file);
1229}
1230
1231/*
1232 * clear_active_flags() is a helper for shrink_active_list(), clearing
1233 * any active bits from the pages in the list.
1234 */
1235static unsigned long clear_active_flags(struct list_head *page_list,
1236 unsigned int *count)
1237{
1238 int nr_active = 0;
1239 int lru;
1240 struct page *page;
1241
1242 list_for_each_entry(page, page_list, lru) {
1243 int numpages = hpage_nr_pages(page);
1244 lru = page_lru_base_type(page);
1245 if (PageActive(page)) {
1246 lru += LRU_ACTIVE;
1247 ClearPageActive(page);
1248 nr_active += numpages;
1249 }
1250 if (count)
1251 count[lru] += numpages;
1252 }
1253
1254 return nr_active;
1255}
1256
1257/** 1287/**
1258 * isolate_lru_page - tries to isolate a page from its LRU list 1288 * isolate_lru_page - tries to isolate a page from its LRU list
1259 * @page: page to isolate from its LRU list 1289 * @page: page to isolate from its LRU list
@@ -1313,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file,
1313 if (current_is_kswapd()) 1343 if (current_is_kswapd())
1314 return 0; 1344 return 0;
1315 1345
1316 if (!scanning_global_lru(sc)) 1346 if (!global_reclaim(sc))
1317 return 0; 1347 return 0;
1318 1348
1319 if (file) { 1349 if (file) {
@@ -1327,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file,
1327 return isolated > inactive; 1357 return isolated > inactive;
1328} 1358}
1329 1359
1330/*
1331 * TODO: Try merging with migrations version of putback_lru_pages
1332 */
1333static noinline_for_stack void 1360static noinline_for_stack void
1334putback_lru_pages(struct zone *zone, struct scan_control *sc, 1361putback_inactive_pages(struct mem_cgroup_zone *mz,
1335 unsigned long nr_anon, unsigned long nr_file, 1362 struct list_head *page_list)
1336 struct list_head *page_list)
1337{ 1363{
1338 struct page *page; 1364 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1339 struct pagevec pvec; 1365 struct zone *zone = mz->zone;
1340 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1366 LIST_HEAD(pages_to_free);
1341
1342 pagevec_init(&pvec, 1);
1343 1367
1344 /* 1368 /*
1345 * Put back any unfreeable pages. 1369 * Put back any unfreeable pages.
1346 */ 1370 */
1347 spin_lock(&zone->lru_lock);
1348 while (!list_empty(page_list)) { 1371 while (!list_empty(page_list)) {
1372 struct page *page = lru_to_page(page_list);
1349 int lru; 1373 int lru;
1350 page = lru_to_page(page_list); 1374
1351 VM_BUG_ON(PageLRU(page)); 1375 VM_BUG_ON(PageLRU(page));
1352 list_del(&page->lru); 1376 list_del(&page->lru);
1353 if (unlikely(!page_evictable(page, NULL))) { 1377 if (unlikely(!page_evictable(page, NULL))) {
@@ -1364,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1364 int numpages = hpage_nr_pages(page); 1388 int numpages = hpage_nr_pages(page);
1365 reclaim_stat->recent_rotated[file] += numpages; 1389 reclaim_stat->recent_rotated[file] += numpages;
1366 } 1390 }
1367 if (!pagevec_add(&pvec, page)) { 1391 if (put_page_testzero(page)) {
1368 spin_unlock_irq(&zone->lru_lock); 1392 __ClearPageLRU(page);
1369 __pagevec_release(&pvec); 1393 __ClearPageActive(page);
1370 spin_lock_irq(&zone->lru_lock); 1394 del_page_from_lru_list(zone, page, lru);
1395
1396 if (unlikely(PageCompound(page))) {
1397 spin_unlock_irq(&zone->lru_lock);
1398 (*get_compound_page_dtor(page))(page);
1399 spin_lock_irq(&zone->lru_lock);
1400 } else
1401 list_add(&page->lru, &pages_to_free);
1371 } 1402 }
1372 } 1403 }
1373 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1374 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1375 1404
1376 spin_unlock_irq(&zone->lru_lock); 1405 /*
1377 pagevec_release(&pvec); 1406 * To save our caller's stack, now use input list for pages to free.
1407 */
1408 list_splice(&pages_to_free, page_list);
1378} 1409}
1379 1410
1380static noinline_for_stack void update_isolated_counts(struct zone *zone, 1411static noinline_for_stack void
1381 struct scan_control *sc, 1412update_isolated_counts(struct mem_cgroup_zone *mz,
1382 unsigned long *nr_anon, 1413 struct list_head *page_list,
1383 unsigned long *nr_file, 1414 unsigned long *nr_anon,
1384 struct list_head *isolated_list) 1415 unsigned long *nr_file)
1385{ 1416{
1386 unsigned long nr_active; 1417 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1418 struct zone *zone = mz->zone;
1387 unsigned int count[NR_LRU_LISTS] = { 0, }; 1419 unsigned int count[NR_LRU_LISTS] = { 0, };
1388 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1420 unsigned long nr_active = 0;
1421 struct page *page;
1422 int lru;
1423
1424 /*
1425 * Count pages and clear active flags
1426 */
1427 list_for_each_entry(page, page_list, lru) {
1428 int numpages = hpage_nr_pages(page);
1429 lru = page_lru_base_type(page);
1430 if (PageActive(page)) {
1431 lru += LRU_ACTIVE;
1432 ClearPageActive(page);
1433 nr_active += numpages;
1434 }
1435 count[lru] += numpages;
1436 }
1389 1437
1390 nr_active = clear_active_flags(isolated_list, count);
1391 __count_vm_events(PGDEACTIVATE, nr_active); 1438 __count_vm_events(PGDEACTIVATE, nr_active);
1392 1439
1393 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1440 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1401,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1401 1448
1402 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1449 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1403 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1450 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1404 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1405 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1406 1451
1407 reclaim_stat->recent_scanned[0] += *nr_anon; 1452 reclaim_stat->recent_scanned[0] += *nr_anon;
1408 reclaim_stat->recent_scanned[1] += *nr_file; 1453 reclaim_stat->recent_scanned[1] += *nr_file;
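putback_inactive_pages() above also changes how freed pages leave the loop: instead of batching everything through a pagevec, each page drops its isolation reference with put_page_testzero(), and only the pages that hit zero are collected on a local list and handed back to the allocator in one go. A toy model of that pattern; the page and list types are invented for the example:

#include <stddef.h>

/* Toy page: a refcount plus a next pointer for the local lists. */
struct toy_page {
        int refcount;
        struct toy_page *next;
};

/* Drop the isolation reference; true means nothing else holds the page. */
static int put_page_testzero(struct toy_page *p)
{
        return --p->refcount == 0;
}

/*
 * Survivors go back onto the LRU; pages whose last reference was the
 * isolation one are gathered on 'to_free' and released in one batch once
 * the lock is dropped, instead of trickling through a pagevec.
 */
static void putback(struct toy_page *isolated, struct toy_page **lru,
                    struct toy_page **to_free)
{
        while (isolated) {
                struct toy_page *p = isolated;

                isolated = p->next;
                if (put_page_testzero(p)) {
                        p->next = *to_free;
                        *to_free = p;
                } else {
                        p->next = *lru;
                        *lru = p;
                }
        }
}

int main(void)
{
        struct toy_page pages[3] = { { 1 }, { 2 }, { 1 } };
        struct toy_page *isolated = NULL, *lru = NULL, *to_free = NULL;

        for (int i = 0; i < 3; i++) {
                pages[i].next = isolated;
                isolated = &pages[i];
        }
        putback(isolated, &lru, &to_free);      /* frees two, keeps one */
        return 0;
}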
@@ -1454,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1454 * of reclaimed pages 1499 * of reclaimed pages
1455 */ 1500 */
1456static noinline_for_stack unsigned long 1501static noinline_for_stack unsigned long
1457shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, 1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1458 struct scan_control *sc, int priority, int file) 1503 struct scan_control *sc, int priority, int file)
1459{ 1504{
1460 LIST_HEAD(page_list); 1505 LIST_HEAD(page_list);
1461 unsigned long nr_scanned; 1506 unsigned long nr_scanned;
@@ -1466,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1466 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1467 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1468 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1514 struct zone *zone = mz->zone;
1469 1515
1470 while (unlikely(too_many_isolated(zone, file, sc))) { 1516 while (unlikely(too_many_isolated(zone, file, sc))) {
1471 congestion_wait(BLK_RW_ASYNC, HZ/10); 1517 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1488,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1488 1534
1489 spin_lock_irq(&zone->lru_lock); 1535 spin_lock_irq(&zone->lru_lock);
1490 1536
1491 if (scanning_global_lru(sc)) { 1537 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
1492 nr_taken = isolate_pages_global(nr_to_scan, &page_list, 1538 &nr_scanned, sc->order,
1493 &nr_scanned, sc->order, reclaim_mode, zone, 0, file); 1539 reclaim_mode, 0, file);
1540 if (global_reclaim(sc)) {
1494 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1495 if (current_is_kswapd()) 1542 if (current_is_kswapd())
1496 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1498,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1498 else 1545 else
1499 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1500 nr_scanned); 1547 nr_scanned);
1501 } else {
1502 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1503 &nr_scanned, sc->order, reclaim_mode, zone,
1504 sc->mem_cgroup, 0, file);
1505 /*
1506 * mem_cgroup_isolate_pages() keeps track of
1507 * scanned pages on its own.
1508 */
1509 } 1548 }
1510 1549
1511 if (nr_taken == 0) { 1550 if (nr_taken == 0) {
@@ -1513,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1513 return 0; 1552 return 0;
1514 } 1553 }
1515 1554
1516 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); 1555 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1556
1557 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1558 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1517 1559
1518 spin_unlock_irq(&zone->lru_lock); 1560 spin_unlock_irq(&zone->lru_lock);
1519 1561
1520 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, 1562 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1521 &nr_dirty, &nr_writeback); 1563 &nr_dirty, &nr_writeback);
1522 1564
1523 /* Check if we should syncronously wait for writeback */ 1565 /* Check if we should syncronously wait for writeback */
1524 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1566 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1525 set_reclaim_mode(priority, sc, true); 1567 set_reclaim_mode(priority, sc, true);
1526 nr_reclaimed += shrink_page_list(&page_list, zone, sc, 1568 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1527 priority, &nr_dirty, &nr_writeback); 1569 priority, &nr_dirty, &nr_writeback);
1528 } 1570 }
1529 1571
1530 local_irq_disable(); 1572 spin_lock_irq(&zone->lru_lock);
1573
1531 if (current_is_kswapd()) 1574 if (current_is_kswapd())
1532 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1575 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1533 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1576 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1534 1577
1535 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1578 putback_inactive_pages(mz, &page_list);
1579
1580 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1581 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1582
1583 spin_unlock_irq(&zone->lru_lock);
1584
1585 free_hot_cold_page_list(&page_list, 1);
1536 1586
1537 /* 1587 /*
1538 * If reclaim is isolating dirty pages under writeback, it implies 1588 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1588,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1588 1638
1589static void move_active_pages_to_lru(struct zone *zone, 1639static void move_active_pages_to_lru(struct zone *zone,
1590 struct list_head *list, 1640 struct list_head *list,
1641 struct list_head *pages_to_free,
1591 enum lru_list lru) 1642 enum lru_list lru)
1592{ 1643{
1593 unsigned long pgmoved = 0; 1644 unsigned long pgmoved = 0;
1594 struct pagevec pvec;
1595 struct page *page; 1645 struct page *page;
1596 1646
1597 pagevec_init(&pvec, 1); 1647 if (buffer_heads_over_limit) {
1648 spin_unlock_irq(&zone->lru_lock);
1649 list_for_each_entry(page, list, lru) {
1650 if (page_has_private(page) && trylock_page(page)) {
1651 if (page_has_private(page))
1652 try_to_release_page(page, 0);
1653 unlock_page(page);
1654 }
1655 }
1656 spin_lock_irq(&zone->lru_lock);
1657 }
1598 1658
1599 while (!list_empty(list)) { 1659 while (!list_empty(list)) {
1660 struct lruvec *lruvec;
1661
1600 page = lru_to_page(list); 1662 page = lru_to_page(list);
1601 1663
1602 VM_BUG_ON(PageLRU(page)); 1664 VM_BUG_ON(PageLRU(page));
1603 SetPageLRU(page); 1665 SetPageLRU(page);
1604 1666
1605 list_move(&page->lru, &zone->lru[lru].list); 1667 lruvec = mem_cgroup_lru_add_list(zone, page, lru);
1606 mem_cgroup_add_lru_list(page, lru); 1668 list_move(&page->lru, &lruvec->lists[lru]);
1607 pgmoved += hpage_nr_pages(page); 1669 pgmoved += hpage_nr_pages(page);
1608 1670
1609 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1671 if (put_page_testzero(page)) {
1610 spin_unlock_irq(&zone->lru_lock); 1672 __ClearPageLRU(page);
1611 if (buffer_heads_over_limit) 1673 __ClearPageActive(page);
1612 pagevec_strip(&pvec); 1674 del_page_from_lru_list(zone, page, lru);
1613 __pagevec_release(&pvec); 1675
1614 spin_lock_irq(&zone->lru_lock); 1676 if (unlikely(PageCompound(page))) {
1677 spin_unlock_irq(&zone->lru_lock);
1678 (*get_compound_page_dtor(page))(page);
1679 spin_lock_irq(&zone->lru_lock);
1680 } else
1681 list_add(&page->lru, pages_to_free);
1615 } 1682 }
1616 } 1683 }
1617 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1684 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1619,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone,
1619 __count_vm_events(PGDEACTIVATE, pgmoved); 1686 __count_vm_events(PGDEACTIVATE, pgmoved);
1620} 1687}
1621 1688
1622static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1689static void shrink_active_list(unsigned long nr_to_scan,
1623 struct scan_control *sc, int priority, int file) 1690 struct mem_cgroup_zone *mz,
1691 struct scan_control *sc,
1692 int priority, int file)
1624{ 1693{
1625 unsigned long nr_taken; 1694 unsigned long nr_taken;
1626 unsigned long pgscanned; 1695 unsigned long nr_scanned;
1627 unsigned long vm_flags; 1696 unsigned long vm_flags;
1628 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1697 LIST_HEAD(l_hold); /* The pages which were snipped off */
1629 LIST_HEAD(l_active); 1698 LIST_HEAD(l_active);
1630 LIST_HEAD(l_inactive); 1699 LIST_HEAD(l_inactive);
1631 struct page *page; 1700 struct page *page;
1632 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1701 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1633 unsigned long nr_rotated = 0; 1702 unsigned long nr_rotated = 0;
1634 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1703 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1704 struct zone *zone = mz->zone;
1635 1705
1636 lru_add_drain(); 1706 lru_add_drain();
1637 1707
@@ -1641,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1641 reclaim_mode |= ISOLATE_CLEAN; 1711 reclaim_mode |= ISOLATE_CLEAN;
1642 1712
1643 spin_lock_irq(&zone->lru_lock); 1713 spin_lock_irq(&zone->lru_lock);
1644 if (scanning_global_lru(sc)) { 1714
1645 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1715 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
1646 &pgscanned, sc->order, 1716 &nr_scanned, sc->order,
1647 reclaim_mode, zone, 1717 reclaim_mode, 1, file);
1648 1, file); 1718 if (global_reclaim(sc))
1649 zone->pages_scanned += pgscanned; 1719 zone->pages_scanned += nr_scanned;
1650 } else {
1651 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1652 &pgscanned, sc->order,
1653 reclaim_mode, zone,
1654 sc->mem_cgroup, 1, file);
1655 /*
1656 * mem_cgroup_isolate_pages() keeps track of
1657 * scanned pages on its own.
1658 */
1659 }
1660 1720
1661 reclaim_stat->recent_scanned[file] += nr_taken; 1721 reclaim_stat->recent_scanned[file] += nr_taken;
1662 1722
1663 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1723 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1664 if (file) 1724 if (file)
1665 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); 1725 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1666 else 1726 else
@@ -1678,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1678 continue; 1738 continue;
1679 } 1739 }
1680 1740
1681 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1741 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1682 nr_rotated += hpage_nr_pages(page); 1742 nr_rotated += hpage_nr_pages(page);
1683 /* 1743 /*
1684 * Identify referenced, file-backed active pages and 1744 * Identify referenced, file-backed active pages and
@@ -1711,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1711 */ 1771 */
1712 reclaim_stat->recent_rotated[file] += nr_rotated; 1772 reclaim_stat->recent_rotated[file] += nr_rotated;
1713 1773
1714 move_active_pages_to_lru(zone, &l_active, 1774 move_active_pages_to_lru(zone, &l_active, &l_hold,
1715 LRU_ACTIVE + file * LRU_FILE); 1775 LRU_ACTIVE + file * LRU_FILE);
1716 move_active_pages_to_lru(zone, &l_inactive, 1776 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1717 LRU_BASE + file * LRU_FILE); 1777 LRU_BASE + file * LRU_FILE);
1718 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1778 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1719 spin_unlock_irq(&zone->lru_lock); 1779 spin_unlock_irq(&zone->lru_lock);
1780
1781 free_hot_cold_page_list(&l_hold, 1);
1720} 1782}
1721 1783
1722#ifdef CONFIG_SWAP 1784#ifdef CONFIG_SWAP
@@ -1741,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
1741 * Returns true if the zone does not have enough inactive anon pages, 1803 * Returns true if the zone does not have enough inactive anon pages,
1742 * meaning some active anon pages need to be deactivated. 1804 * meaning some active anon pages need to be deactivated.
1743 */ 1805 */
1744static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) 1806static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1745{ 1807{
1746 int low;
1747
1748 /* 1808 /*
1749 * If we don't have swap space, anonymous page deactivation 1809 * If we don't have swap space, anonymous page deactivation
1750 * is pointless. 1810 * is pointless.
@@ -1752,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1752 if (!total_swap_pages) 1812 if (!total_swap_pages)
1753 return 0; 1813 return 0;
1754 1814
1755 if (scanning_global_lru(sc)) 1815 if (!scanning_global_lru(mz))
1756 low = inactive_anon_is_low_global(zone); 1816 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1757 else 1817 mz->zone);
1758 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); 1818
1759 return low; 1819 return inactive_anon_is_low_global(mz->zone);
1760} 1820}
1761#else 1821#else
1762static inline int inactive_anon_is_low(struct zone *zone, 1822static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1763 struct scan_control *sc)
1764{ 1823{
1765 return 0; 1824 return 0;
1766} 1825}
@@ -1778,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1778 1837
1779/** 1838/**
1780 * inactive_file_is_low - check if file pages need to be deactivated 1839 * inactive_file_is_low - check if file pages need to be deactivated
1781 * @zone: zone to check 1840 * @mz: memory cgroup and zone to check
1782 * @sc: scan control of this context
1783 * 1841 *
1784 * When the system is doing streaming IO, memory pressure here 1842 * When the system is doing streaming IO, memory pressure here
1785 * ensures that active file pages get deactivated, until more 1843 * ensures that active file pages get deactivated, until more
@@ -1791,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone)
1791 * This uses a different ratio than the anonymous pages, because 1849 * This uses a different ratio than the anonymous pages, because
1792 * the page cache uses a use-once replacement algorithm. 1850 * the page cache uses a use-once replacement algorithm.
1793 */ 1851 */
1794static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) 1852static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1795{ 1853{
1796 int low; 1854 if (!scanning_global_lru(mz))
1855 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1856 mz->zone);
1797 1857
1798 if (scanning_global_lru(sc)) 1858 return inactive_file_is_low_global(mz->zone);
1799 low = inactive_file_is_low_global(zone);
1800 else
1801 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1802 return low;
1803} 1859}
1804 1860
1805static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, 1861static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
1806 int file)
1807{ 1862{
1808 if (file) 1863 if (file)
1809 return inactive_file_is_low(zone, sc); 1864 return inactive_file_is_low(mz);
1810 else 1865 else
1811 return inactive_anon_is_low(zone, sc); 1866 return inactive_anon_is_low(mz);
1812} 1867}
1813 1868
1814static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1869static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1815 struct zone *zone, struct scan_control *sc, int priority) 1870 struct mem_cgroup_zone *mz,
1871 struct scan_control *sc, int priority)
1816{ 1872{
1817 int file = is_file_lru(lru); 1873 int file = is_file_lru(lru);
1818 1874
1819 if (is_active_lru(lru)) { 1875 if (is_active_lru(lru)) {
1820 if (inactive_list_is_low(zone, sc, file)) 1876 if (inactive_list_is_low(mz, file))
1821 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1877 shrink_active_list(nr_to_scan, mz, sc, priority, file);
1822 return 0; 1878 return 0;
1823 } 1879 }
1824 1880
1825 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1881 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
1826} 1882}
1827 1883
1828static int vmscan_swappiness(struct scan_control *sc) 1884static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1885 struct scan_control *sc)
1829{ 1886{
1830 if (scanning_global_lru(sc)) 1887 if (global_reclaim(sc))
1831 return vm_swappiness; 1888 return vm_swappiness;
1832 return mem_cgroup_swappiness(sc->mem_cgroup); 1889 return mem_cgroup_swappiness(mz->mem_cgroup);
1833} 1890}
1834 1891
1835/* 1892/*
@@ -1840,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc)
1840 * 1897 *
1841 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1898 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1842 */ 1899 */
1843static void get_scan_count(struct zone *zone, struct scan_control *sc, 1900static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1844 unsigned long *nr, int priority) 1901 unsigned long *nr, int priority)
1845{ 1902{
1846 unsigned long anon, file, free; 1903 unsigned long anon, file, free;
1847 unsigned long anon_prio, file_prio; 1904 unsigned long anon_prio, file_prio;
1848 unsigned long ap, fp; 1905 unsigned long ap, fp;
1849 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1906 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1850 u64 fraction[2], denominator; 1907 u64 fraction[2], denominator;
1851 enum lru_list l; 1908 enum lru_list lru;
1852 int noswap = 0; 1909 int noswap = 0;
1853 bool force_scan = false; 1910 bool force_scan = false;
1854 1911
@@ -1862,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1862 * latencies, so it's better to scan a minimum amount there as 1919 * latencies, so it's better to scan a minimum amount there as
1863 * well. 1920 * well.
1864 */ 1921 */
1865 if (scanning_global_lru(sc) && current_is_kswapd()) 1922 if (current_is_kswapd() && mz->zone->all_unreclaimable)
1866 force_scan = true; 1923 force_scan = true;
1867 if (!scanning_global_lru(sc)) 1924 if (!global_reclaim(sc))
1868 force_scan = true; 1925 force_scan = true;
1869 1926
1870 /* If we have no swap space, do not bother scanning anon pages. */ 1927 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1876,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1876 goto out; 1933 goto out;
1877 } 1934 }
1878 1935
1879 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1936 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
1880 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1937 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
1881 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1938 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
1882 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1939 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
1883 1940
1884 if (scanning_global_lru(sc)) { 1941 if (global_reclaim(sc)) {
1885 free = zone_page_state(zone, NR_FREE_PAGES); 1942 free = zone_page_state(mz->zone, NR_FREE_PAGES);
1886 /* If we have very few page cache pages, 1943 /* If we have very few page cache pages,
1887 force-scan anon pages. */ 1944 force-scan anon pages. */
1888 if (unlikely(file + free <= high_wmark_pages(zone))) { 1945 if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
1889 fraction[0] = 1; 1946 fraction[0] = 1;
1890 fraction[1] = 0; 1947 fraction[1] = 0;
1891 denominator = 1; 1948 denominator = 1;
@@ -1897,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1897 * With swappiness at 100, anonymous and file have the same priority. 1954 * With swappiness at 100, anonymous and file have the same priority.
1898 * This scanning priority is essentially the inverse of IO cost. 1955 * This scanning priority is essentially the inverse of IO cost.
1899 */ 1956 */
1900 anon_prio = vmscan_swappiness(sc); 1957 anon_prio = vmscan_swappiness(mz, sc);
1901 file_prio = 200 - vmscan_swappiness(sc); 1958 file_prio = 200 - vmscan_swappiness(mz, sc);
1902 1959
1903 /* 1960 /*
1904 * OK, so we have swap space and a fair amount of page cache 1961 * OK, so we have swap space and a fair amount of page cache
@@ -1911,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1911 * 1968 *
1912 * anon in [0], file in [1] 1969 * anon in [0], file in [1]
1913 */ 1970 */
1914 spin_lock_irq(&zone->lru_lock); 1971 spin_lock_irq(&mz->zone->lru_lock);
1915 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1972 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1916 reclaim_stat->recent_scanned[0] /= 2; 1973 reclaim_stat->recent_scanned[0] /= 2;
1917 reclaim_stat->recent_rotated[0] /= 2; 1974 reclaim_stat->recent_rotated[0] /= 2;
@@ -1932,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1932 1989
1933 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1990 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1934 fp /= reclaim_stat->recent_rotated[1] + 1; 1991 fp /= reclaim_stat->recent_rotated[1] + 1;
1935 spin_unlock_irq(&zone->lru_lock); 1992 spin_unlock_irq(&mz->zone->lru_lock);
1936 1993
1937 fraction[0] = ap; 1994 fraction[0] = ap;
1938 fraction[1] = fp; 1995 fraction[1] = fp;
1939 denominator = ap + fp + 1; 1996 denominator = ap + fp + 1;
1940out: 1997out:
1941 for_each_evictable_lru(l) { 1998 for_each_evictable_lru(lru) {
1942 int file = is_file_lru(l); 1999 int file = is_file_lru(lru);
1943 unsigned long scan; 2000 unsigned long scan;
1944 2001
1945 scan = zone_nr_lru_pages(zone, sc, l); 2002 scan = zone_nr_lru_pages(mz, lru);
1946 if (priority || noswap) { 2003 if (priority || noswap) {
1947 scan >>= priority; 2004 scan >>= priority;
1948 if (!scan && force_scan) 2005 if (!scan && force_scan)
1949 scan = SWAP_CLUSTER_MAX; 2006 scan = SWAP_CLUSTER_MAX;
1950 scan = div64_u64(scan * fraction[file], denominator); 2007 scan = div64_u64(scan * fraction[file], denominator);
1951 } 2008 }
1952 nr[l] = scan; 2009 nr[lru] = scan;
1953 } 2010 }
1954} 2011}
1955 2012
@@ -1960,7 +2017,7 @@ out:
1960 * back to the allocator and call try_to_compact_zone(), we ensure that 2017 * back to the allocator and call try_to_compact_zone(), we ensure that
1961 * there are enough free pages for it to be likely successful 2018 * there are enough free pages for it to be likely successful
1962 */ 2019 */
1963static inline bool should_continue_reclaim(struct zone *zone, 2020static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
1964 unsigned long nr_reclaimed, 2021 unsigned long nr_reclaimed,
1965 unsigned long nr_scanned, 2022 unsigned long nr_scanned,
1966 struct scan_control *sc) 2023 struct scan_control *sc)
@@ -2000,15 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
2000 * inactive lists are large enough, continue reclaiming 2057 * inactive lists are large enough, continue reclaiming
2001 */ 2058 */
2002 pages_for_compaction = (2UL << sc->order); 2059 pages_for_compaction = (2UL << sc->order);
2003 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2060 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
2004 if (nr_swap_pages > 0) 2061 if (nr_swap_pages > 0)
2005 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 2062 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
2006 if (sc->nr_reclaimed < pages_for_compaction && 2063 if (sc->nr_reclaimed < pages_for_compaction &&
2007 inactive_lru_pages > pages_for_compaction) 2064 inactive_lru_pages > pages_for_compaction)
2008 return true; 2065 return true;
2009 2066
2010 /* If compaction would go ahead or the allocation would succeed, stop */ 2067 /* If compaction would go ahead or the allocation would succeed, stop */
2011 switch (compaction_suitable(zone, sc->order)) { 2068 switch (compaction_suitable(mz->zone, sc->order)) {
2012 case COMPACT_PARTIAL: 2069 case COMPACT_PARTIAL:
2013 case COMPACT_CONTINUE: 2070 case COMPACT_CONTINUE:
2014 return false; 2071 return false;
@@ -2020,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone,
2020/* 2077/*
2021 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2078 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2022 */ 2079 */
2023static void shrink_zone(int priority, struct zone *zone, 2080static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2024 struct scan_control *sc) 2081 struct scan_control *sc)
2025{ 2082{
2026 unsigned long nr[NR_LRU_LISTS]; 2083 unsigned long nr[NR_LRU_LISTS];
2027 unsigned long nr_to_scan; 2084 unsigned long nr_to_scan;
2028 enum lru_list l; 2085 enum lru_list lru;
2029 unsigned long nr_reclaimed, nr_scanned; 2086 unsigned long nr_reclaimed, nr_scanned;
2030 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2087 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2031 struct blk_plug plug; 2088 struct blk_plug plug;
@@ -2033,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone,
2033restart: 2090restart:
2034 nr_reclaimed = 0; 2091 nr_reclaimed = 0;
2035 nr_scanned = sc->nr_scanned; 2092 nr_scanned = sc->nr_scanned;
2036 get_scan_count(zone, sc, nr, priority); 2093 get_scan_count(mz, sc, nr, priority);
2037 2094
2038 blk_start_plug(&plug); 2095 blk_start_plug(&plug);
2039 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2096 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2040 nr[LRU_INACTIVE_FILE]) { 2097 nr[LRU_INACTIVE_FILE]) {
2041 for_each_evictable_lru(l) { 2098 for_each_evictable_lru(lru) {
2042 if (nr[l]) { 2099 if (nr[lru]) {
2043 nr_to_scan = min_t(unsigned long, 2100 nr_to_scan = min_t(unsigned long,
2044 nr[l], SWAP_CLUSTER_MAX); 2101 nr[lru], SWAP_CLUSTER_MAX);
2045 nr[l] -= nr_to_scan; 2102 nr[lru] -= nr_to_scan;
2046 2103
2047 nr_reclaimed += shrink_list(l, nr_to_scan, 2104 nr_reclaimed += shrink_list(lru, nr_to_scan,
2048 zone, sc, priority); 2105 mz, sc, priority);
2049 } 2106 }
2050 } 2107 }
2051 /* 2108 /*
@@ -2066,17 +2123,89 @@ restart:
2066 * Even if we did not try to evict anon pages at all, we want to 2123 * Even if we did not try to evict anon pages at all, we want to
2067 * rebalance the anon lru active/inactive ratio. 2124 * rebalance the anon lru active/inactive ratio.
2068 */ 2125 */
2069 if (inactive_anon_is_low(zone, sc)) 2126 if (inactive_anon_is_low(mz))
2070 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 2127 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
2071 2128
2072 /* reclaim/compaction might need reclaim to continue */ 2129 /* reclaim/compaction might need reclaim to continue */
2073 if (should_continue_reclaim(zone, nr_reclaimed, 2130 if (should_continue_reclaim(mz, nr_reclaimed,
2074 sc->nr_scanned - nr_scanned, sc)) 2131 sc->nr_scanned - nr_scanned, sc))
2075 goto restart; 2132 goto restart;
2076 2133
2077 throttle_vm_writeout(sc->gfp_mask); 2134 throttle_vm_writeout(sc->gfp_mask);
2078} 2135}
2079 2136
2137static void shrink_zone(int priority, struct zone *zone,
2138 struct scan_control *sc)
2139{
2140 struct mem_cgroup *root = sc->target_mem_cgroup;
2141 struct mem_cgroup_reclaim_cookie reclaim = {
2142 .zone = zone,
2143 .priority = priority,
2144 };
2145 struct mem_cgroup *memcg;
2146
2147 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2148 do {
2149 struct mem_cgroup_zone mz = {
2150 .mem_cgroup = memcg,
2151 .zone = zone,
2152 };
2153
2154 shrink_mem_cgroup_zone(priority, &mz, sc);
2155 /*
2156 * Limit reclaim has historically picked one memcg and
2157 * scanned it with decreasing priority levels until
2158 * nr_to_reclaim had been reclaimed. This priority
2159 * cycle is thus over after a single memcg.
2160 *
2161 * Direct reclaim and kswapd, on the other hand, have
2162 * to scan all memory cgroups to fulfill the overall
2163 * scan target for the zone.
2164 */
2165 if (!global_reclaim(sc)) {
2166 mem_cgroup_iter_break(root, memcg);
2167 break;
2168 }
2169 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2170 } while (memcg);
2171}
2172
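The new shrink_zone() wrapper above is the heart of the change: it walks the memcg hierarchy with mem_cgroup_iter() and runs shrink_mem_cgroup_zone() on one (memcg, zone) pair at a time, stopping after the first memcg for limit reclaim. A toy model of that control flow, with the hierarchy reduced to an array of names:

#include <stdbool.h>
#include <stdio.h>

/* The hierarchy is just an array of cgroup names for this sketch. */
static const char *hierarchy[] = { "root", "A", "A/a1", "B", NULL };

/*
 * Global reclaim (kswapd, direct reclaim) walks every memcg so the whole
 * zone gets scanned; limit reclaim stops after the target memcg, keeping
 * the historical one-memcg-per-priority-cycle behaviour.
 */
static void shrink_zone_model(bool global_reclaim)
{
        for (int i = 0; hierarchy[i]; i++) {
                printf("  shrink_mem_cgroup_zone(%s)\n", hierarchy[i]);
                if (!global_reclaim)
                        break;
        }
}

int main(void)
{
        puts("global reclaim:");
        shrink_zone_model(true);
        puts("limit reclaim:");
        shrink_zone_model(false);
        return 0;
}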
2173/* Returns true if compaction should go ahead for a high-order request */
2174static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2175{
2176 unsigned long balance_gap, watermark;
2177 bool watermark_ok;
2178
2179 /* Do not consider compaction for orders reclaim is meant to satisfy */
2180 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2181 return false;
2182
2183 /*
2184 * Compaction takes time to run and there are potentially other
2185 * callers using the pages just freed. Continue reclaiming until
2186 * there is a buffer of free pages available to give compaction
2187 * a reasonable chance of completing and allocating the page
2188 */
2189 balance_gap = min(low_wmark_pages(zone),
2190 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2191 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2192 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2193 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2194
2195 /*
2196 * If compaction is deferred, reclaim up to a point where
2197 * compaction will have a chance of success when re-enabled
2198 */
2199 if (compaction_deferred(zone))
2200 return watermark_ok;
2201
2202 /* If compaction is not ready to start, keep reclaiming */
2203 if (!compaction_suitable(zone, sc->order))
2204 return false;
2205
2206 return watermark_ok;
2207}
2208
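compaction_ready() keeps reclaim going until the zone has a cushion above the high watermark: a balance gap plus twice the requested order in pages. A worked example with invented zone numbers; the gap ratio of 100 is an assumption about KSWAPD_ZONE_BALANCE_GAP_RATIO, not taken from this hunk:

#include <stdio.h>

int main(void)
{
        unsigned long present_pages = 262144;   /* invented: 1GB of 4K pages */
        unsigned long low_wmark = 3000;         /* invented watermarks */
        unsigned long high_wmark = 4000;
        unsigned long ratio = 100;              /* assumed gap ratio */
        int order = 9;                          /* a THP-sized request */

        unsigned long balance_gap = (present_pages + ratio - 1) / ratio;
        if (balance_gap > low_wmark)
                balance_gap = low_wmark;

        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        printf("reclaim until ~%lu free pages (high %lu + gap %lu + %lu)\n",
               watermark, high_wmark, balance_gap, 2UL << order);
        return 0;
}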
2080/* 2209/*
2081 * This is the direct reclaim path, for page-allocating processes. We only 2210 * This is the direct reclaim path, for page-allocating processes. We only
2082 * try to reclaim pages from zones which will satisfy the caller's allocation 2211 * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2094,8 +2223,9 @@ restart:
2094 * scan then give up on it. 2223 * scan then give up on it.
2095 * 2224 *
2096 * This function returns true if a zone is being reclaimed for a costly 2225 * This function returns true if a zone is being reclaimed for a costly
2097 * high-order allocation and compaction is either ready to begin or deferred. 2226 * high-order allocation and compaction is ready to begin. This indicates to
2098 * This indicates to the caller that it should retry the allocation or fail. 2227 * the caller that it should consider retrying the allocation instead of
2228 * further reclaim.
2099 */ 2229 */
2100static bool shrink_zones(int priority, struct zonelist *zonelist, 2230static bool shrink_zones(int priority, struct zonelist *zonelist,
2101 struct scan_control *sc) 2231 struct scan_control *sc)
@@ -2104,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2104 struct zone *zone; 2234 struct zone *zone;
2105 unsigned long nr_soft_reclaimed; 2235 unsigned long nr_soft_reclaimed;
2106 unsigned long nr_soft_scanned; 2236 unsigned long nr_soft_scanned;
2107 bool should_abort_reclaim = false; 2237 bool aborted_reclaim = false;
2108 2238
2109 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2239 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2110 gfp_zone(sc->gfp_mask), sc->nodemask) { 2240 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2114,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2114 * Take care memory controller reclaiming has small influence 2244 * Take care memory controller reclaiming has small influence
2115 * to global LRU. 2245 * to global LRU.
2116 */ 2246 */
2117 if (scanning_global_lru(sc)) { 2247 if (global_reclaim(sc)) {
2118 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2248 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2119 continue; 2249 continue;
2120 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2250 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -2129,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2129 * noticable problem, like transparent huge page 2259 * noticable problem, like transparent huge page
2130 * allocations. 2260 * allocations.
2131 */ 2261 */
2132 if (sc->order > PAGE_ALLOC_COSTLY_ORDER && 2262 if (compaction_ready(zone, sc)) {
2133 (compaction_suitable(zone, sc->order) || 2263 aborted_reclaim = true;
2134 compaction_deferred(zone))) {
2135 should_abort_reclaim = true;
2136 continue; 2264 continue;
2137 } 2265 }
2138 } 2266 }
@@ -2154,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2154 shrink_zone(priority, zone, sc); 2282 shrink_zone(priority, zone, sc);
2155 } 2283 }
2156 2284
2157 return should_abort_reclaim; 2285 return aborted_reclaim;
2158} 2286}
2159 2287
2160static bool zone_reclaimable(struct zone *zone) 2288static bool zone_reclaimable(struct zone *zone)
@@ -2208,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2208 struct zoneref *z; 2336 struct zoneref *z;
2209 struct zone *zone; 2337 struct zone *zone;
2210 unsigned long writeback_threshold; 2338 unsigned long writeback_threshold;
2339 bool aborted_reclaim;
2211 2340
2212 get_mems_allowed(); 2341 get_mems_allowed();
2213 delayacct_freepages_start(); 2342 delayacct_freepages_start();
2214 2343
2215 if (scanning_global_lru(sc)) 2344 if (global_reclaim(sc))
2216 count_vm_event(ALLOCSTALL); 2345 count_vm_event(ALLOCSTALL);
2217 2346
2218 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2347 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2219 sc->nr_scanned = 0; 2348 sc->nr_scanned = 0;
2220 if (!priority) 2349 if (!priority)
2221 disable_swap_token(sc->mem_cgroup); 2350 disable_swap_token(sc->target_mem_cgroup);
2222 if (shrink_zones(priority, zonelist, sc)) 2351 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2223 break;
2224 2352
2225 /* 2353 /*
2226 * Don't shrink slabs when reclaiming memory from 2354 * Don't shrink slabs when reclaiming memory from
2227 * over limit cgroups 2355 * over limit cgroups
2228 */ 2356 */
2229 if (scanning_global_lru(sc)) { 2357 if (global_reclaim(sc)) {
2230 unsigned long lru_pages = 0; 2358 unsigned long lru_pages = 0;
2231 for_each_zone_zonelist(zone, z, zonelist, 2359 for_each_zone_zonelist(zone, z, zonelist,
2232 gfp_zone(sc->gfp_mask)) { 2360 gfp_zone(sc->gfp_mask)) {
@@ -2287,8 +2415,12 @@ out:
2287 if (oom_killer_disabled) 2415 if (oom_killer_disabled)
2288 return 0; 2416 return 0;
2289 2417
2418 /* Aborted reclaim to try compaction? don't OOM, then */
2419 if (aborted_reclaim)
2420 return 1;
2421
2290 /* top priority shrink_zones still had more to do? don't OOM, then */ 2422 /* top priority shrink_zones still had more to do? don't OOM, then */
2291 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2423 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2292 return 1; 2424 return 1;
2293 2425
2294 return 0; 2426 return 0;
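For the caller, the new aborted_reclaim flag means "reclaim stopped on purpose so compaction can run", which must not be confused with "reclaim failed". A sketch of the resulting verdict at the end of do_try_to_free_pages(); the nr_reclaimed short-circuit is assumed to happen earlier in the function, outside the hunk shown:

#include <stdbool.h>
#include <stdio.h>

/*
 * Non-zero tells the page allocator "progress is still possible, do not
 * go to the OOM killer yet".
 */
static int reclaim_verdict(unsigned long nr_reclaimed, bool aborted_reclaim,
                           bool zones_still_reclaimable)
{
        if (nr_reclaimed)
                return 1;       /* real progress was made (assumed earlier) */
        if (aborted_reclaim)
                return 1;       /* stopped early so compaction can run */
        if (zones_still_reclaimable)
                return 1;       /* the top-priority pass had more to do */
        return 0;               /* genuinely stuck: allow the OOM path */
}

int main(void)
{
        printf("aborted for compaction -> %d (retry, no OOM)\n",
               reclaim_verdict(0, true, false));
        return 0;
}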
@@ -2305,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2305 .may_unmap = 1, 2437 .may_unmap = 1,
2306 .may_swap = 1, 2438 .may_swap = 1,
2307 .order = order, 2439 .order = order,
2308 .mem_cgroup = NULL, 2440 .target_mem_cgroup = NULL,
2309 .nodemask = nodemask, 2441 .nodemask = nodemask,
2310 }; 2442 };
2311 struct shrink_control shrink = { 2443 struct shrink_control shrink = {
@@ -2325,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2325 2457
2326#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2458#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2327 2459
2328unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2460unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2329 gfp_t gfp_mask, bool noswap, 2461 gfp_t gfp_mask, bool noswap,
2330 struct zone *zone, 2462 struct zone *zone,
2331 unsigned long *nr_scanned) 2463 unsigned long *nr_scanned)
@@ -2337,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2337 .may_unmap = 1, 2469 .may_unmap = 1,
2338 .may_swap = !noswap, 2470 .may_swap = !noswap,
2339 .order = 0, 2471 .order = 0,
2340 .mem_cgroup = mem, 2472 .target_mem_cgroup = memcg,
2473 };
2474 struct mem_cgroup_zone mz = {
2475 .mem_cgroup = memcg,
2476 .zone = zone,
2341 }; 2477 };
2342 2478
2343 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2479 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2354,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2354 * will pick up pages from other mem cgroup's as well. We hack 2490 * will pick up pages from other mem cgroup's as well. We hack
2355 * the priority and make it zero. 2491 * the priority and make it zero.
2356 */ 2492 */
2357 shrink_zone(0, zone, &sc); 2493 shrink_mem_cgroup_zone(0, &mz, &sc);
2358 2494
2359 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2495 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2360 2496
@@ -2362,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2362 return sc.nr_reclaimed; 2498 return sc.nr_reclaimed;
2363} 2499}
2364 2500
2365unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2501unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2366 gfp_t gfp_mask, 2502 gfp_t gfp_mask,
2367 bool noswap) 2503 bool noswap)
2368{ 2504{
@@ -2375,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2375 .may_swap = !noswap, 2511 .may_swap = !noswap,
2376 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2512 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2377 .order = 0, 2513 .order = 0,
2378 .mem_cgroup = mem_cont, 2514 .target_mem_cgroup = memcg,
2379 .nodemask = NULL, /* we don't care the placement */ 2515 .nodemask = NULL, /* we don't care the placement */
2380 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2516 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2381 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2517 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2389,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2389 * take care of from where we get pages. So the node where we start the 2525 * take care of from where we get pages. So the node where we start the
2390 * scan does not need to be the current node. 2526 * scan does not need to be the current node.
2391 */ 2527 */
2392 nid = mem_cgroup_select_victim_node(mem_cont); 2528 nid = mem_cgroup_select_victim_node(memcg);
2393 2529
2394 zonelist = NODE_DATA(nid)->node_zonelists; 2530 zonelist = NODE_DATA(nid)->node_zonelists;
2395 2531
@@ -2405,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2405} 2541}
2406#endif 2542#endif
2407 2543
2544static void age_active_anon(struct zone *zone, struct scan_control *sc,
2545 int priority)
2546{
2547 struct mem_cgroup *memcg;
2548
2549 if (!total_swap_pages)
2550 return;
2551
2552 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2553 do {
2554 struct mem_cgroup_zone mz = {
2555 .mem_cgroup = memcg,
2556 .zone = zone,
2557 };
2558
2559 if (inactive_anon_is_low(&mz))
2560 shrink_active_list(SWAP_CLUSTER_MAX, &mz,
2561 sc, priority, 0);
2562
2563 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2564 } while (memcg);
2565}
2566
2408/* 2567/*
2409 * pgdat_balanced is used when checking if a node is balanced for high-order 2568 * pgdat_balanced is used when checking if a node is balanced for high-order
2410 * allocations. Only zones that meet watermarks and are in a zone allowed 2569 * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2525,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2525 */ 2684 */
2526 .nr_to_reclaim = ULONG_MAX, 2685 .nr_to_reclaim = ULONG_MAX,
2527 .order = order, 2686 .order = order,
2528 .mem_cgroup = NULL, 2687 .target_mem_cgroup = NULL,
2529 }; 2688 };
2530 struct shrink_control shrink = { 2689 struct shrink_control shrink = {
2531 .gfp_mask = sc.gfp_mask, 2690 .gfp_mask = sc.gfp_mask,
@@ -2564,9 +2723,7 @@ loop_again:
2564 * Do some background aging of the anon list, to give 2723 * Do some background aging of the anon list, to give
2565 * pages a chance to be referenced before reclaiming. 2724 * pages a chance to be referenced before reclaiming.
2566 */ 2725 */
2567 if (inactive_anon_is_low(zone, &sc)) 2726 age_active_anon(zone, &sc, priority);
2568 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2569 &sc, priority, 0);
2570 2727
2571 if (!zone_watermark_ok_safe(zone, order, 2728 if (!zone_watermark_ok_safe(zone, order,
2572 high_wmark_pages(zone), 0, 0)) { 2729 high_wmark_pages(zone), 0, 0)) {
@@ -3355,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3355 */ 3512 */
3356static void check_move_unevictable_page(struct page *page, struct zone *zone) 3513static void check_move_unevictable_page(struct page *page, struct zone *zone)
3357{ 3514{
3358 VM_BUG_ON(PageActive(page)); 3515 struct lruvec *lruvec;
3359 3516
3517 VM_BUG_ON(PageActive(page));
3360retry: 3518retry:
3361 ClearPageUnevictable(page); 3519 ClearPageUnevictable(page);
3362 if (page_evictable(page, NULL)) { 3520 if (page_evictable(page, NULL)) {
3363 enum lru_list l = page_lru_base_type(page); 3521 enum lru_list l = page_lru_base_type(page);
3364 3522
3365 __dec_zone_state(zone, NR_UNEVICTABLE); 3523 __dec_zone_state(zone, NR_UNEVICTABLE);
3366 list_move(&page->lru, &zone->lru[l].list); 3524 lruvec = mem_cgroup_lru_move_lists(zone, page,
3367 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); 3525 LRU_UNEVICTABLE, l);
3526 list_move(&page->lru, &lruvec->lists[l]);
3368 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3527 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
3369 __count_vm_event(UNEVICTABLE_PGRESCUED); 3528 __count_vm_event(UNEVICTABLE_PGRESCUED);
3370 } else { 3529 } else {
@@ -3372,8 +3531,9 @@ retry:
3372 * rotate unevictable list 3531 * rotate unevictable list
3373 */ 3532 */
3374 SetPageUnevictable(page); 3533 SetPageUnevictable(page);
3375 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 3534 lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
3376 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); 3535 LRU_UNEVICTABLE);
3536 list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
3377 if (page_evictable(page, NULL)) 3537 if (page_evictable(page, NULL))
3378 goto retry; 3538 goto retry;
3379 } 3539 }
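Both unevictable hunks above, like the active-list ones earlier, stop touching zone->lru[] directly and go through a looked-up lruvec instead. A stripped-down stand-in for the shape such a structure exposes to these callers; this is an illustration, not the kernel definition:

struct list_head {
        struct list_head *prev, *next;
};

enum lru_list {
        LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
        LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
        LRU_UNEVICTABLE, NR_LRU_LISTS
};

/*
 * One set of LRU list heads.  The zone owns one, and with memcg enabled
 * every memory cgroup gets its own per zone; callers find the right one
 * through helpers such as mem_cgroup_lru_add_list() and
 * mem_cgroup_lru_move_lists() used above.
 */
struct lruvec {
        struct list_head lists[NR_LRU_LISTS];
};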
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8fd603b1665e..f600557a7659 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
295} 295}
296EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
297 297
298#ifdef CONFIG_CMPXCHG_LOCAL 298#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
299/* 299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead 300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. 301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
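The vmstat hunk only tracks the Kconfig rename to CONFIG_HAVE_CMPXCHG_LOCAL; the point of the guarded code is that a compare-and-swap retry loop can replace local_irq_save/restore around a counter update. A userspace illustration of that idea using the GCC/Clang __atomic builtins, standing in for the kernel's this_cpu_cmpxchg() path:

#include <stdio.h>

static long counter;

/*
 * Retry a compare-and-swap instead of disabling interrupts around the
 * read-modify-write.
 */
static void add_to_counter(long delta)
{
        long cur, want;

        do {
                cur = __atomic_load_n(&counter, __ATOMIC_RELAXED);
                want = cur + delta;
        } while (!__atomic_compare_exchange_n(&counter, &cur, want, 1,
                                              __ATOMIC_RELAXED,
                                              __ATOMIC_RELAXED));
}

int main(void)
{
        add_to_counter(3);
        add_to_counter(-1);
        printf("counter = %ld\n", counter);
        return 0;
}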
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
new file mode 100644
index 000000000000..4ec84018cc13
--- /dev/null
+++ b/tools/testing/selftests/Makefile
@@ -0,0 +1,11 @@
1TARGETS = breakpoints
2
3all:
4 for TARGET in $(TARGETS); do \
5 make -C $$TARGET; \
6 done;
7
8clean:
9 for TARGET in $(TARGETS); do \
10 make -C $$TARGET clean; \
11 done;
diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile
new file mode 100644
index 000000000000..f362722cdce7
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/Makefile
@@ -0,0 +1,20 @@
1# Taken from perf makefile
2uname_M := $(shell uname -m 2>/dev/null || echo not)
3ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
4ifeq ($(ARCH),i386)
5 ARCH := x86
6endif
7ifeq ($(ARCH),x86_64)
8 ARCH := x86
9endif
10
11
12all:
13ifeq ($(ARCH),x86)
14 gcc breakpoint_test.c -o run_test
15else
16 echo "Not an x86 target, can't build breakpoints selftests"
17endif
18
19clean:
20 rm -fr run_test
diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c
new file mode 100644
index 000000000000..a0743f3b2b57
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/breakpoint_test.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Licensed under the terms of the GNU GPL License version 2
+ *
+ * Selftests for breakpoints (and more generally the do_debug() path) in x86.
+ */
+
+
+#include <sys/ptrace.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <sys/user.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+
+/* Breakpoint access modes */
+enum {
+	BP_X = 1,
+	BP_RW = 2,
+	BP_W = 4,
+};
+
+static pid_t child_pid;
+
+/*
+ * Ensures the child and parent are always "talking" about
+ * the same test sequence. (ie: that we haven't forgotten
+ * to call check_trapped() somewhere).
+ */
+static int nr_tests;
+
+static void set_breakpoint_addr(void *addr, int n)
+{
+	int ret;
+
+	ret = ptrace(PTRACE_POKEUSER, child_pid,
+		     offsetof(struct user, u_debugreg[n]), addr);
+	if (ret) {
+		perror("Can't set breakpoint addr\n");
+		exit(-1);
+	}
+}
+
+static void toggle_breakpoint(int n, int type, int len,
+			      int local, int global, int set)
+{
+	int ret;
+
+	int xtype, xlen;
+	unsigned long vdr7, dr7;
+
+	switch (type) {
+	case BP_X:
+		xtype = 0;
+		break;
+	case BP_W:
+		xtype = 1;
+		break;
+	case BP_RW:
+		xtype = 3;
+		break;
+	}
+
+	switch (len) {
+	case 1:
+		xlen = 0;
+		break;
+	case 2:
+		xlen = 4;
+		break;
+	case 4:
+		xlen = 0xc;
+		break;
+	case 8:
+		xlen = 8;
+		break;
+	}
+
+	dr7 = ptrace(PTRACE_PEEKUSER, child_pid,
+		     offsetof(struct user, u_debugreg[7]), 0);
+
+	vdr7 = (xlen | xtype) << 16;
+	vdr7 <<= 4 * n;
+
+	if (local) {
+		vdr7 |= 1 << (2 * n);
+		vdr7 |= 1 << 8;
+	}
+	if (global) {
+		vdr7 |= 2 << (2 * n);
+		vdr7 |= 1 << 9;
+	}
+
+	if (set)
+		dr7 |= vdr7;
+	else
+		dr7 &= ~vdr7;
+
+	ret = ptrace(PTRACE_POKEUSER, child_pid,
+		     offsetof(struct user, u_debugreg[7]), dr7);
+	if (ret) {
+		perror("Can't set dr7");
+		exit(-1);
+	}
+}
+
+/* Dummy variables to test read/write accesses */
+static unsigned long long dummy_var[4];
+
+/* Dummy functions to test execution accesses */
+static void dummy_func(void) { }
+static void dummy_func1(void) { }
+static void dummy_func2(void) { }
+static void dummy_func3(void) { }
+
+static void (*dummy_funcs[])(void) = {
+	dummy_func,
+	dummy_func1,
+	dummy_func2,
+	dummy_func3,
+};
+
+static int trapped;
+
+static void check_trapped(void)
+{
+	/*
+	 * If we haven't trapped, wake up the parent
+	 * so that it notices the failure.
+	 */
+	if (!trapped)
+		kill(getpid(), SIGUSR1);
+	trapped = 0;
+
+	nr_tests++;
+}
+
+static void write_var(int len)
+{
+	char *pcval; short *psval; int *pival; long long *plval;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		switch (len) {
+		case 1:
+			pcval = (char *)&dummy_var[i];
+			*pcval = 0xff;
+			break;
+		case 2:
+			psval = (short *)&dummy_var[i];
+			*psval = 0xffff;
+			break;
+		case 4:
+			pival = (int *)&dummy_var[i];
+			*pival = 0xffffffff;
+			break;
+		case 8:
+			plval = (long long *)&dummy_var[i];
+			*plval = 0xffffffffffffffffLL;
+			break;
+		}
+		check_trapped();
+	}
+}
+
+static void read_var(int len)
+{
+	char cval; short sval; int ival; long long lval;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		switch (len) {
+		case 1:
+			cval = *(char *)&dummy_var[i];
+			break;
+		case 2:
+			sval = *(short *)&dummy_var[i];
+			break;
+		case 4:
+			ival = *(int *)&dummy_var[i];
+			break;
+		case 8:
+			lval = *(long long *)&dummy_var[i];
+			break;
+		}
+		check_trapped();
+	}
+}
+
+/*
+ * Do the r/w/x accesses to trigger the breakpoints. And run
+ * the usual traps.
+ */
+static void trigger_tests(void)
+{
+	int len, local, global, i;
+	char val;
+	int ret;
+
+	ret = ptrace(PTRACE_TRACEME, 0, NULL, 0);
+	if (ret) {
+		perror("Can't be traced?\n");
+		return;
+	}
+
+	/* Wake up father so that it sets up the first test */
+	kill(getpid(), SIGUSR1);
+
+	/* Test instruction breakpoints */
+	for (local = 0; local < 2; local++) {
+		for (global = 0; global < 2; global++) {
+			if (!local && !global)
+				continue;
+
+			for (i = 0; i < 4; i++) {
+				dummy_funcs[i]();
+				check_trapped();
+			}
+		}
+	}
+
+	/* Test write watchpoints */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				write_var(len);
+			}
+		}
+	}
+
+	/* Test read/write watchpoints (on read accesses) */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				read_var(len);
+			}
+		}
+	}
+
+	/* Icebp trap */
+	asm(".byte 0xf1\n");
+	check_trapped();
+
+	/* Int 3 trap */
+	asm("int $3\n");
+	check_trapped();
+
+	kill(getpid(), SIGUSR1);
+}
+
+static void check_success(const char *msg)
+{
+	const char *msg2;
+	int child_nr_tests;
+	int status;
+
+	/* Wait for the child to SIGTRAP */
+	wait(&status);
+
+	msg2 = "Failed";
+
+	if (WSTOPSIG(status) == SIGTRAP) {
+		child_nr_tests = ptrace(PTRACE_PEEKDATA, child_pid,
+					&nr_tests, 0);
+		if (child_nr_tests == nr_tests)
+			msg2 = "Ok";
+		if (ptrace(PTRACE_POKEDATA, child_pid, &trapped, 1)) {
+			perror("Can't poke\n");
+			exit(-1);
+		}
+	}
+
+	nr_tests++;
+
+	printf("%s [%s]\n", msg, msg2);
+}
+
+static void launch_instruction_breakpoints(char *buf, int local, int global)
+{
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		set_breakpoint_addr(dummy_funcs[i], i);
+		toggle_breakpoint(i, BP_X, 1, local, global, 1);
+		ptrace(PTRACE_CONT, child_pid, NULL, 0);
+		sprintf(buf, "Test breakpoint %d with local: %d global: %d",
+			i, local, global);
+		check_success(buf);
+		toggle_breakpoint(i, BP_X, 1, local, global, 0);
+	}
+}
+
+static void launch_watchpoints(char *buf, int mode, int len,
+			       int local, int global)
+{
+	const char *mode_str;
+	int i;
+
+	if (mode == BP_W)
+		mode_str = "write";
+	else
+		mode_str = "read";
+
+	for (i = 0; i < 4; i++) {
+		set_breakpoint_addr(&dummy_var[i], i);
+		toggle_breakpoint(i, mode, len, local, global, 1);
+		ptrace(PTRACE_CONT, child_pid, NULL, 0);
+		sprintf(buf, "Test %s watchpoint %d with len: %d local: "
+			"%d global: %d", mode_str, i, len, local, global);
+		check_success(buf);
+		toggle_breakpoint(i, mode, len, local, global, 0);
+	}
+}
+
+/* Set the breakpoints and check the child successfully trigger them */
+static void launch_tests(void)
+{
+	char buf[1024];
+	int len, local, global, i;
+
+	/* Instruction breakpoints */
+	for (local = 0; local < 2; local++) {
+		for (global = 0; global < 2; global++) {
+			if (!local && !global)
+				continue;
+			launch_instruction_breakpoints(buf, local, global);
+		}
+	}
+
+	/* Write watchpoint */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				launch_watchpoints(buf, BP_W, len,
+						   local, global);
+			}
+		}
+	}
+
+	/* Read-Write watchpoint */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				launch_watchpoints(buf, BP_RW, len,
+						   local, global);
+			}
+		}
+	}
+
+	/* Icebp traps */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success("Test icebp");
+
+	/* Int 3 traps */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success("Test int 3 trap");
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+}
+
+int main(int argc, char **argv)
+{
+	pid_t pid;
+	int ret;
+
+	pid = fork();
+	if (!pid) {
+		trigger_tests();
+		return 0;
+	}
+
+	child_pid = pid;
+
+	wait(NULL);
+
+	launch_tests();
+
+	wait(NULL);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests
new file mode 100644
index 000000000000..320718a4e6bf
--- /dev/null
+++ b/tools/testing/selftests/run_tests
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+TARGETS=breakpoints
+
+for TARGET in $TARGETS
+do
+	$TARGET/run_test
+done
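
For reference, a minimal sketch of how the new selftests are built and run, going only by the two Makefiles and the run_tests driver added above (it assumes an x86 host, since the breakpoints Makefile only compiles there, and that the commands are issued from a checked-out kernel tree):

	# Build every TARGET listed in tools/testing/selftests/Makefile,
	# then run each target's run_test binary via the run_tests driver.
	cd tools/testing/selftests
	make
	bash ./run_tests	# the script is added with mode 100644, so invoke it via bash

The driver simply loops over TARGETS and runs $TARGET/run_test, so adding a new selftest amounts to adding a directory whose Makefile produces a run_test binary and listing it in the TARGETS variables of both the Makefile and run_tests.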