100 files changed, 2589 insertions, 1562 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 8b093f8222d3..91bd6ca5440f 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -346,6 +346,10 @@ Description:
 		number of objects per slab. If a slab cannot be allocated
 		because of fragmentation, SLUB will retry with the minimum order
 		possible depending on its characteristics.
+		When debug_guardpage_minorder=N (N > 0) parameter is specified
+		(see Documentation/kernel-parameters.txt), the minimum possible
+		order is used and this sysfs entry can not be used to change
+		the order at run time.
 
 What:		/sys/kernel/slab/cache/order_fallback
 Date:		April 2008
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4d8774f6f48a..4c95c0034a4b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -61,7 +61,7 @@ Brief summary of control files.
 memory.failcnt			 # show the number of memory usage hits limits
 memory.memsw.failcnt		 # show the number of memory+Swap hits limits
 memory.max_usage_in_bytes	 # show max memory usage recorded
-memory.memsw.usage_in_bytes	 # show max memory+Swap usage recorded
+memory.memsw.max_usage_in_bytes	 # show max memory+Swap usage recorded
 memory.soft_limit_in_bytes	 # set/show soft limit of memory usage
 memory.stat			 # show various statistics
 memory.use_hierarchy		 # set/show hierarchical account enabled
@@ -410,8 +410,11 @@ memory.stat file includes following statistics
 cache		- # of bytes of page cache memory.
 rss		- # of bytes of anonymous and swap cache memory.
 mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		- # of pages paged in (equivalent to # of charging events).
-pgpgout		- # of pages paged out (equivalent to # of uncharging events).
+pgpgin		- # of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		- # of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
 inactive_anon	- # of bytes of anonymous memory and swap cache memory on
 		LRU list.
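The clarified pgpgin/pgpgout wording above is easy to misread as byte counters. A minimal user-space sketch that reads them as the event counts they are; the cgroup mount point used here is an assumption and may differ on a given system:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long long val;
	/* assumed mount point of the memory cgroup hierarchy */
	FILE *f = fopen("/sys/fs/cgroup/memory/memory.stat", "r");

	if (!f) {
		perror("memory.stat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		/* these are charge/uncharge event counts, not bytes */
		if (!strcmp(name, "pgpgin") || !strcmp(name, "pgpgout"))
			printf("%s = %llu events\n", name, val);
	}
	fclose(f);
	return 0;
}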
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 12fee132fbe2..a76a26a1db8a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -307,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   blkio_ticks   time spent waiting for block IO
   gtime         guest time of the task in jiffies
   cgtime        guest time of the task children in jiffies
+  start_data    address above which program data+bss is placed
+  end_data      address below which program data+bss is placed
+  start_brk     address above which program heap can be expanded with brk()
 ..............................................................................
 
 The /proc/PID/maps file containing the currently mapped memory regions and
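A small sketch of reading the new fields from user space. It assumes the three fields are appended at the end of the stat line, i.e. that they are the 45th-47th space-separated fields once the parenthesised comm is accounted for; treat those positions as an assumption and check the table for the kernel actually running:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[4096];
	unsigned long start_data = 0, end_data = 0, start_brk = 0;
	FILE *f = fopen("/proc/self/stat", "r");
	char *p, *tok;
	int field = 2;			/* "(comm)" counts as field 2 */

	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	p = strrchr(buf, ')');		/* skip "pid (comm)" safely */
	if (!p)
		return 1;
	for (tok = strtok(p + 1, " \n"); tok; tok = strtok(NULL, " \n")) {
		field++;
		if (field == 45)
			start_data = strtoul(tok, NULL, 10);
		else if (field == 46)
			end_data = strtoul(tok, NULL, 10);
		else if (field == 47)
			start_brk = strtoul(tok, NULL, 10);
	}
	printf("data+bss: %#lx-%#lx  brk start: %#lx\n",
	       start_data, end_data, start_brk);
	return 0;
}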
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d8cd8b2c30d..8c20fbd8b42d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated.
 
 ==============================================================
 
+ns_last_pid:
+
+The last pid allocated in the current pid namespace (the namespace the
+task reading this sysctl lives in). When selecting a pid for the next
+task on fork, the kernel tries to allocate a number starting from this
+one.
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
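Checkpoint/restore tools use this knob to steer pid allocation. A hedged sketch, assuming the writer has the privilege required in this kernel (CAP_SYS_ADMIN) and that no other task in the namespace forks in between:

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");
	pid_t pid;

	if (!f) {
		perror("ns_last_pid");
		return 1;
	}
	/* ask the allocator to continue after 9999 */
	fprintf(f, "%d", 9999);
	fclose(f);

	pid = fork();
	if (pid == 0)
		_exit(0);
	/* expected: 10000, unless another fork raced us */
	printf("child pid: %d\n", (int)pid);
	waitpid(pid, NULL, 0);
	return 0;
}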
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index 2acdda9601b0..6752870c4970 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -131,7 +131,10 @@ slub_min_objects.
 slub_max_order specified the order at which slub_min_objects should no
 longer be checked. This is useful to avoid SLUB trying to generate
 super large order pages to fit slub_min_objects of a slab cache with
-large object sizes into one high order page.
+large object sizes into one high order page. Setting the command line
+parameter debug_guardpage_minorder=N (N > 0) forces slub_max_order to 0,
+which causes slabs to be allocated with the minimum possible order.
 
 SLUB Debug output
 -----------------
diff --git a/arch/Kconfig b/arch/Kconfig
index 2505740b81d2..4f55c736be11 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -185,4 +185,18 @@ config HAVE_RCU_TABLE_FREE
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
+config HAVE_ALIGNED_STRUCT_PAGE
+	bool
+	help
+	  This makes sure that struct pages are double word aligned and that
+	  e.g. the SLUB allocator can perform double word atomic operations
+	  on a struct page for better performance. However selecting this
+	  might increase the size of a struct page by a word.
+
+config HAVE_CMPXCHG_LOCAL
+	bool
+
+config HAVE_CMPXCHG_DOUBLE
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h
index 9702c2213e1e..62d9ded01635 100644
--- a/arch/avr32/include/asm/system.h
+++ b/arch/avr32/include/asm/system.h
@@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 
 struct pt_regs;
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err);
+void die(const char *str, struct pt_regs *regs, long err);
 void _exception(long signr, struct pt_regs *regs, int code,
 		unsigned long addr);
 
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 7aa25756412f..3d760c06f024 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -24,7 +24,7 @@
 
 static DEFINE_SPINLOCK(die_lock);
 
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
 	static int die_counter;
 
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index d9f397fae03e..691be0b95c1e 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -309,7 +309,6 @@ struct thread_struct {
 }
 
 #define start_thread(regs,new_ip,new_sp) do {					\
-	set_fs(USER_DS);							\
 	regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL))	\
 		       & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS));	\
 	regs->cr_iip = new_ip;							\
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 3d3aeef46947..4eed35814994 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -27,11 +27,11 @@
 #include <asm/sal.h>
 #include <asm/mca.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 					unsigned long indirection_page,
 					unsigned long start_address,
 					struct ia64_boot_param *boot_param,
-					unsigned long pal_addr) ATTRIB_NORET;
+					unsigned long pal_addr) __noreturn;
 
 struct kimage *ia64_kimage;
 
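The NORET_TYPE removals in this and the following hunks all follow one pattern: NORET_TYPE had long expanded to nothing, so it is simply dropped, and where ATTRIB_NORET was used it becomes the single __noreturn annotation. A stand-alone sketch of what the annotation buys; the kernel's __noreturn wraps the GCC attribute, and the stand-in definition here exists only so the example compiles on its own:

#include <stdio.h>
#include <stdlib.h>

/* stand-in for the kernel's __noreturn from <linux/compiler.h> */
#define __noreturn __attribute__((__noreturn__))

static void __noreturn die(const char *str, long err)
{
	fprintf(stderr, "die: %s (err=%ld)\n", str, err);
	exit(1);	/* never returns, as the attribute promises */
}

int main(void)
{
	die("example", -22);
	/* not reached; the compiler knows die() cannot return */
}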
diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c
index 82a4bb51d5d8..b95a451b1c3a 100644
--- a/arch/m68k/amiga/config.c
+++ b/arch/m68k/amiga/config.c
@@ -511,8 +511,7 @@ static unsigned long amiga_gettimeoffset(void)
 	return ticks + offset;
 }
 
-static NORET_TYPE void amiga_reset(void)
-    ATTRIB_NORET;
+static void amiga_reset(void) __noreturn;
 
 static void amiga_reset(void)
 {
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..7b99c670e478 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
 extern asmlinkage void syscall_trace_enter(struct pt_regs *regs);
 extern asmlinkage void syscall_trace_leave(struct pt_regs *regs);
 
-extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET;
+extern void die(const char *, struct pt_regs *) __noreturn;
 
 static inline void die_if_kernel(const char *str, struct pt_regs *regs)
 {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 5c8a49d55054..bbddb86c1fa1 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs)
 /*
  * NMI exception handler.
  */
-NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs)
+void __noreturn nmi_exception_handler(struct pt_regs *regs)
 {
 	bust_spinlocks(1);
 	printk("NMI taken!!!!\n");
diff --git a/arch/mn10300/include/asm/exceptions.h b/arch/mn10300/include/asm/exceptions.h
index ca3e20508c77..95a4d42c3a06 100644
--- a/arch/mn10300/include/asm/exceptions.h
+++ b/arch/mn10300/include/asm/exceptions.h
@@ -110,7 +110,7 @@ extern asmlinkage void nmi_handler(void);
 extern asmlinkage void misalignment(struct pt_regs *, enum exception_code);
 
 extern void die(const char *, struct pt_regs *, enum exception_code)
-	ATTRIB_NORET;
+	__noreturn;
 
 extern int die_if_no_fixup(const char *, struct pt_regs *, enum exception_code);
 
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index 9ce66e9d1c2b..7213ec9e594c 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -196,7 +196,6 @@ typedef unsigned int elf_caddr_t;
 	/* offset pc for priv. level */			\
 	pc |= 3;					\
 							\
-	set_fs(USER_DS);				\
 	regs->iasq[0] = spaceid;			\
 	regs->iasq[1] = spaceid;			\
 	regs->iaoq[0] = pc;				\
@@ -299,7 +298,6 @@ on downward growing arches, it looks like this:
 	elf_addr_t pc = (elf_addr_t)new_pc | 3;		\
 	elf_caddr_t *argv = (elf_caddr_t *)bprm->exec + 1;	\
 							\
-	set_fs(USER_DS);				\
 	regs->iasq[0] = spaceid;			\
 	regs->iasq[1] = spaceid;			\
 	regs->iaoq[0] = pc;				\
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 4b4b9181a1a0..62c60b87d039 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -192,7 +192,6 @@ void flush_thread(void)
 	/* Only needs to handle fpu stuff or perf monitors.
 	** REVISIT: several arches implement a "lazy fpu state".
 	*/
-	set_fs(USER_DS);
 }
 
 void release_thread(struct task_struct *dead_task)
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index e63f2e7d2efb..affe5dcce7f4 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -16,10 +16,10 @@
 #include <asm/hw_irq.h>
 #include <asm/io.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 				unsigned long indirection_page,
 				unsigned long reboot_code_buffer,
-				unsigned long start_address) ATTRIB_NORET;
+				unsigned long start_address) __noreturn;
 
 /*
  * This is a generic machine_kexec function suitable at least for
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 26ccbf77dd41..d7f609086a99 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data =
 struct paca_struct kexec_paca;
 
 /* Our assembly helper, in kexec_stub.S */
-extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
+extern void kexec_sequence(void *newstack, unsigned long start,
 					void *image, void *control,
-					void (*clear_all)(void)) ATTRIB_NORET;
+					void (*clear_all)(void)) __noreturn;
 
 /* too late to fail here */
 void default_machine_kexec(struct kimage *image)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4ff3d8e411a7..3feefc3842a8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  */
 static void __init setup_node_to_cpumask_map(void)
 {
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 330a57b7c17c..36f957f31842 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		/* These are almost always orderly shutdowns. */
 		return;
 	case KMSG_DUMP_OOPS:
-	case KMSG_DUMP_KEXEC:
 		break;
 	case KMSG_DUMP_PANIC:
 		panicking = true;
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 27272f6a14c2..d25843a6a915 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -236,7 +236,7 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc)
 /*
  * Function to drop a processor into disabled wait state
  */
-static inline void ATTRIB_NORET disabled_wait(unsigned long code)
+static inline void __noreturn disabled_wait(unsigned long code)
 {
 	unsigned long ctl_buf;
 	psw_t dw_psw;
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index fab88431a06f..0fd2e863e114 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -30,7 +30,7 @@ struct mcck_struct {
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static NORET_TYPE void s390_handle_damage(char *msg)
+static void s390_handle_damage(char *msg)
 {
 	smp_send_stop();
 	disabled_wait((unsigned long) __builtin_return_address(0));
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index aaf6d59c2012..7ec665178125 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -70,7 +70,7 @@ void show_regs(struct pt_regs * regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
 	do_exit(fn(arg));
 }
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 210c1cabcb7f..cbd4e4bb9fc5 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -285,7 +285,7 @@ void show_regs(struct pt_regs *regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
 	do_exit(fn(arg));
 }
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index e00d7179989e..6255f2eab112 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -248,11 +248,11 @@ static void setup_quasi_va_is_pa(void)
 }
 
 
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
-	NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long)
-		ATTRIB_NORET;
+	void (*rnk)(unsigned long, void *, unsigned long)
+		__noreturn;
 
 	/* Mask all interrupts before starting to reboot. */
 	interrupt_mask_set_mask(~0ULL);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a150f4c35e94..6c14ecd851d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,9 @@ config X86
 	select PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
+	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+	select HAVE_CMPXCHG_LOCAL if !M386
+	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-	def_bool X86_64 || (X86_32 && !M386)
-
-config CMPXCHG_DOUBLE
-	def_bool y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 020cd2e80873..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
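For reference, a sketch of the API the corrected comments point at: once setup_node_to_cpumask_map() has run, cpumask_of_node() can be used to walk a node's CPUs. This is a kernel-side illustration only, not part of the patch:

#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/printk.h>

static void print_node_cpus(int nid)
{
	int cpu;

	/* valid only after setup_node_to_cpumask_map() has run */
	for_each_cpu(cpu, cpumask_of_node(nid))
		pr_info("node %d owns cpu %d\n", nid, cpu);
}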
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,14 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f17e3ea041c0..ed5de58c340f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -295,11 +295,22 @@ static int memory_block_change_state(struct memory_block *mem,
 
 	ret = memory_block_action(mem->start_section_nr, to_state);
 
-	if (ret)
+	if (ret) {
 		mem->state = from_state_req;
-	else
-		mem->state = to_state;
+		goto out;
+	}
 
+	mem->state = to_state;
+	switch (mem->state) {
+	case MEM_OFFLINE:
+		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
+		break;
+	case MEM_ONLINE:
+		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
+		break;
+	default:
+		break;
+	}
 out:
 	mutex_unlock(&mem->state_mutex);
 	return ret;
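With the uevents added above, onlining or offlining a memory block through sysfs now also notifies udev (or any netlink uevent listener). A user-space sketch of driving the state file; the block number in the path is a placeholder:

#include <stdio.h>

int main(void)
{
	/* "memory8" is a placeholder block; list /sys/devices/system/memory */
	const char *path = "/sys/devices/system/memory/memory8/state";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fputs("offline", f) == EOF)	/* or "online" */
		perror("write");
	fclose(f);
	return 0;
}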
diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c
index 7c7f42a1f880..9fec3232b736 100644
--- a/drivers/char/ramoops.c
+++ b/drivers/char/ramoops.c
@@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper,
 	struct timeval timestamp;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
@@ -126,8 +125,8 @@ static int __init ramoops_probe(struct platform_device *pdev)
 		goto fail3;
 	}
 
-	rounddown_pow_of_two(pdata->mem_size);
-	rounddown_pow_of_two(pdata->record_size);
+	pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+	pdata->record_size = rounddown_pow_of_two(pdata->record_size);
 
 	/* Check for the minimum memory size */
 	if (pdata->mem_size < MIN_MEM_SIZE &&
@@ -148,14 +147,6 @@ static int __init ramoops_probe(struct platform_device *pdev)
 	cxt->phys_addr = pdata->mem_address;
 	cxt->record_size = pdata->record_size;
 	cxt->dump_oops = pdata->dump_oops;
-	/*
-	 * Update the module parameter variables as well so they are visible
-	 * through /sys/module/ramoops/parameters/
-	 */
-	mem_size = pdata->mem_size;
-	mem_address = pdata->mem_address;
-	record_size = pdata->record_size;
-	dump_oops = pdata->dump_oops;
 
 	if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
 		pr_err("request mem region failed\n");
@@ -176,6 +167,15 @@ static int __init ramoops_probe(struct platform_device *pdev)
 		goto fail1;
 	}
 
+	/*
+	 * Update the module parameter variables as well so they are visible
+	 * through /sys/module/ramoops/parameters/
+	 */
+	mem_size = pdata->mem_size;
+	mem_address = pdata->mem_address;
+	record_size = pdata->record_size;
+	dump_oops = pdata->dump_oops;
+
 	return 0;
 
 fail1:
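The ramoops fix above matters because rounddown_pow_of_two() returns the rounded value rather than modifying its argument, so the old calls were silent no-ops. A user-space sketch with a local stand-in for the kernel helper:

#include <stdio.h>

/* user-space stand-in for the kernel's rounddown_pow_of_two() */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 != 0 && p * 2 <= n)
		p *= 2;
	return n ? p : 0;
}

int main(void)
{
	unsigned long mem_size = 3 * 1024 * 1024;	/* 3 MiB */

	/* bug pattern: result discarded, mem_size stays 3 MiB */
	rounddown_pow_of_two(mem_size);

	/* fixed pattern: assign the result, mem_size becomes 2 MiB */
	mem_size = rounddown_pow_of_two(mem_size);
	printf("rounded size: %lu\n", mem_size);
	return 0;
}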
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index db8e8272d69b..3ce99e00a49e 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper,
 	char *dst;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
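Both mtdoops and ramoops are kmsg dumpers that now ignore the removed KMSG_DUMP_KEXEC reason. A sketch of the same filtering in a minimal dumper; the callback signature shown is the one used by this kernel series and should be treated as an assumption for other kernel versions:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void example_do_dump(struct kmsg_dumper *dumper,
			    enum kmsg_dump_reason reason,
			    const char *s1, unsigned long l1,
			    const char *s2, unsigned long l2)
{
	/* only react to oops and panic, as the drivers above now do */
	if (reason != KMSG_DUMP_OOPS && reason != KMSG_DUMP_PANIC)
		return;
	/* persist s1/l1 and s2/l2 somewhere crash-safe here */
}

static struct kmsg_dumper example_dumper = {
	.dump = example_do_dump,
};

static int __init example_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
module_init(example_init);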
diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index d0b597b50398..0cb64f50cecd 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -3404,8 +3404,8 @@ static int __init parport_init_mode_setup(char *str)
 #endif
 
 #ifdef MODULE
-static const char *irq[PARPORT_PC_MAX_PORTS];
-static const char *dma[PARPORT_PC_MAX_PORTS];
+static char *irq[PARPORT_PC_MAX_PORTS];
+static char *dma[PARPORT_PC_MAX_PORTS];
 
 MODULE_PARM_DESC(io, "Base I/O address (SPP regs)");
 module_param_array(io, int, NULL, 0);
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 081dc4745274..fe13ac567d54 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -81,7 +81,7 @@ static int vram __devinitdata = 0;
 static int bpp __devinitdata = 8;
 static int reverse_i2c __devinitdata;
 #ifdef CONFIG_MTRR
-static int nomtrr __devinitdata = 0;
+static bool nomtrr __devinitdata = false;
 #endif
 #ifdef CONFIG_PMAC_BACKLIGHT
 static int backlight __devinitdata = 1;
@@ -1509,7 +1509,7 @@ static int __devinit nvidiafb_setup(char *options)
 			backlight = simple_strtoul(this_opt+10, NULL, 0);
 #ifdef CONFIG_MTRR
 		} else if (!strncmp(this_opt, "nomtrr", 6)) {
-			nomtrr = 1;
+			nomtrr = true;
 #endif
 		} else if (!strncmp(this_opt, "fpdither:", 9)) {
 			fpdither = simple_strtol(this_opt+9, NULL, 0);
@@ -1599,7 +1599,7 @@ MODULE_PARM_DESC(bpp, "pixel width in bits"
 module_param(reverse_i2c, int, 0);
 MODULE_PARM_DESC(reverse_i2c, "reverse port assignment of the i2c bus");
 #ifdef CONFIG_MTRR
-module_param(nomtrr, bool, 0);
+module_param(nomtrr, bool, false);
 MODULE_PARM_DESC(nomtrr, "Disables MTRR support (0 or 1=disabled) "
 		 "(default=0)");
 #endif
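The nvidiafb change is part of a tree-wide move to back bool module parameters with actual bool variables. The usual pattern looks like the sketch below; note that the third module_param() argument is the sysfs permission mask, so the literal false in the hunk above simply means "no sysfs entry":

#include <linux/module.h>
#include <linux/moduleparam.h>

static bool nomtrr;			/* defaults to false */
module_param(nomtrr, bool, 0444);	/* world-readable in sysfs */
MODULE_PARM_DESC(nomtrr, "Disable MTRR support (0 or 1=disabled) (default=0)");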
diff --git a/fs/block_dev.c b/fs/block_dev.c
index afe74dda632b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
@@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				disk_put_part(bdev->bd_part);
 				bdev->bd_part = NULL;
 				bdev->bd_disk = NULL;
+				bdev->bd_queue = NULL;
 				mutex_unlock(&bdev->bd_mutex);
 				disk_unblock_events(disk);
 				put_disk(disk);
@@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
+	bdev->bd_queue = NULL;
 	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..d8525662ca7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 #ifdef CONFIG_MIGRATION
 static int btree_migratepage(struct address_space *mapping,
-			struct page *newpage, struct page *page)
+			struct page *newpage, struct page *page,
+			enum migrate_mode mode)
 {
 	/*
 	 * we can't safely write a btree page from here,
@@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping,
 	if (page_has_private(page) &&
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
-	return migrate_page(mapping, newpage, page);
+	return migrate_page(mapping, newpage, page, mode);
 }
 #endif
 
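btree_migratepage() here (and hugetlbfs_migrate_page() later in this section) grow an enum migrate_mode argument. A sketch of the distinction that parameter carries, as introduced by this series; the exact definition lives in include/linux/migrate_mode.h, and the member list below should be treated as belonging to this kernel version:

/*
 * Sketch of the migration modes the new parameter selects:
 *  MIGRATE_ASYNC      - never block
 *  MIGRATE_SYNC_LIGHT - allow blocking on most operations, but not on
 *                       writeback of dirty pages
 *  MIGRATE_SYNC       - allow blocking, including waiting for writeback
 */
enum migrate_mode {
	MIGRATE_ASYNC,
	MIGRATE_SYNC_LIGHT,
	MIGRATE_SYNC,
};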
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <linux/atomic.h>
+#include <linux/prefetch.h>
 
 /*
  * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 {
 	int ret;
 	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
+	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
-	unsigned long dio_count;/* Number of dio_block-sized blocks */
-	unsigned long blkmask;
 	int create;
 
 	/*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	if (ret == 0) {
 		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
 		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
-		dio_count = sdio->final_block_in_request - sdio->block_in_file;
-		fs_count = dio_count >> sdio->blkfactor;
-		blkmask = (1 << sdio->blkfactor) - 1;
-		if (dio_count & blkmask)
-			fs_count++;
+		fs_endblk = (sdio->final_block_in_request - 1) >>
+					sdio->blkfactor;
+		fs_count = fs_endblk - fs_startblk + 1;
 
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
  * individual fields and will generate much worse code. This is important
  * for the whole file.
  */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	size_t size;
 	unsigned long addr;
 	unsigned blkbits = inode->i_blkbits;
-	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
 
-	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	/*
+	 * Avoid references to bdev if not absolutely needed to give
+	 * the early prefetch in the caller enough time.
+	 */
 
 	if (offset & blocksize_mask) {
 		if (bdev)
-			blkbits = bdev_blkbits;
+			blkbits = blksize_bits(bdev_logical_block_size(bdev));
 		blocksize_mask = (1 << blkbits) - 1;
 		if (offset & blocksize_mask)
 			goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		addr = (unsigned long)iov[seg].iov_base;
 		size = iov[seg].iov_len;
 		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+		if (unlikely((addr & blocksize_mask) ||
+			     (size & blocksize_mask))) {
 			if (bdev)
-				blkbits = bdev_blkbits;
+				blkbits = blksize_bits(
+					 bdev_logical_block_size(bdev));
 			blocksize_mask = (1 << blkbits) - 1;
 			if ((addr & blocksize_mask) || (size & blocksize_mask))
 				goto out;
 		}
 	}
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io, int flags)
+{
+	/*
+	 * The block device state is needed in the end to finally
+	 * submit everything.  Since it's likely to be cache cold
+	 * prefetch it here as first thing to hide some of the
+	 * latency.
+	 *
+	 * Attempt to prefetch the pieces we likely need later.
+	 */
+	prefetch(&bdev->bd_disk->part_tbl);
+	prefetch(bdev->bd_queue);
+	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				     nr_segs, get_block, end_io,
+				     submit_io, flags);
+}
+
 EXPORT_SYMBOL(__blockdev_direct_IO);
 
 static __init int dio_init(void)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23a..aabdfc38cf24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -197,6 +197,12 @@ struct eventpoll { | |||
197 | 197 | ||
198 | /* The user that created the eventpoll descriptor */ | 198 | /* The user that created the eventpoll descriptor */ |
199 | struct user_struct *user; | 199 | struct user_struct *user; |
200 | |||
201 | struct file *file; | ||
202 | |||
203 | /* used to optimize loop detection check */ | ||
204 | int visited; | ||
205 | struct list_head visited_list_link; | ||
200 | }; | 206 | }; |
201 | 207 | ||
202 | /* Wait structure used by the poll hooks */ | 208 | /* Wait structure used by the poll hooks */ |
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; | |||
255 | /* Slab cache used to allocate "struct eppoll_entry" */ | 261 | /* Slab cache used to allocate "struct eppoll_entry" */ |
256 | static struct kmem_cache *pwq_cache __read_mostly; | 262 | static struct kmem_cache *pwq_cache __read_mostly; |
257 | 263 | ||
264 | /* Visited nodes during ep_loop_check(), so we can unset them when we finish */ | ||
265 | static LIST_HEAD(visited_list); | ||
266 | |||
267 | /* | ||
268 | * List of files with newly added links, where we may need to limit the number | ||
269 | * of emanating paths. Protected by the epmutex. | ||
270 | */ | ||
271 | static LIST_HEAD(tfile_check_list); | ||
272 | |||
258 | #ifdef CONFIG_SYSCTL | 273 | #ifdef CONFIG_SYSCTL |
259 | 274 | ||
260 | #include <linux/sysctl.h> | 275 | #include <linux/sysctl.h> |
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = { | |||
276 | }; | 291 | }; |
277 | #endif /* CONFIG_SYSCTL */ | 292 | #endif /* CONFIG_SYSCTL */ |
278 | 293 | ||
294 | static const struct file_operations eventpoll_fops; | ||
295 | |||
296 | static inline int is_file_epoll(struct file *f) | ||
297 | { | ||
298 | return f->f_op == &eventpoll_fops; | ||
299 | } | ||
279 | 300 | ||
280 | /* Setup the structure that is used as key for the RB tree */ | 301 | /* Setup the structure that is used as key for the RB tree */ |
281 | static inline void ep_set_ffd(struct epoll_filefd *ffd, | 302 | static inline void ep_set_ffd(struct epoll_filefd *ffd, |
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { | |||
711 | .llseek = noop_llseek, | 732 | .llseek = noop_llseek, |
712 | }; | 733 | }; |
713 | 734 | ||
714 | /* Fast test to see if the file is an eventpoll file */ | ||
715 | static inline int is_file_epoll(struct file *f) | ||
716 | { | ||
717 | return f->f_op == &eventpoll_fops; | ||
718 | } | ||
719 | |||
720 | /* | 735 | /* |
721 | * This is called from eventpoll_release() to unlink files from the eventpoll | 736 | * This is called from eventpoll_release() to unlink files from the eventpoll |
722 | * interface. We need to have this facility to cleanup correctly files that are | 737 | * interface. We need to have this facility to cleanup correctly files that are |
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | |||
926 | rb_insert_color(&epi->rbn, &ep->rbr); | 941 | rb_insert_color(&epi->rbn, &ep->rbr); |
927 | } | 942 | } |
928 | 943 | ||
944 | |||
945 | |||
946 | #define PATH_ARR_SIZE 5 | ||
947 | /* | ||
948 | * These are the number paths of length 1 to 5, that we are allowing to emanate | ||
949 | * from a single file of interest. For example, we allow 1000 paths of length | ||
950 | * 1, to emanate from each file of interest. This essentially represents the | ||
951 | * potential wakeup paths, which need to be limited in order to avoid massive | ||
952 | * uncontrolled wakeup storms. The common use case should be a single ep which | ||
953 | * is connected to n file sources. In this case each file source has 1 path | ||
954 | * of length 1. Thus, the numbers below should be more than sufficient. These | ||
955 | * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify | ||
956 | * and delete can't add additional paths. Protected by the epmutex. | ||
957 | */ | ||
958 | static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; | ||
959 | static int path_count[PATH_ARR_SIZE]; | ||
960 | |||
961 | static int path_count_inc(int nests) | ||
962 | { | ||
963 | if (++path_count[nests] > path_limits[nests]) | ||
964 | return -1; | ||
965 | return 0; | ||
966 | } | ||
967 | |||
968 | static void path_count_init(void) | ||
969 | { | ||
970 | int i; | ||
971 | |||
972 | for (i = 0; i < PATH_ARR_SIZE; i++) | ||
973 | path_count[i] = 0; | ||
974 | } | ||
975 | |||
976 | static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) | ||
977 | { | ||
978 | int error = 0; | ||
979 | struct file *file = priv; | ||
980 | struct file *child_file; | ||
981 | struct epitem *epi; | ||
982 | |||
983 | list_for_each_entry(epi, &file->f_ep_links, fllink) { | ||
984 | child_file = epi->ep->file; | ||
985 | if (is_file_epoll(child_file)) { | ||
986 | if (list_empty(&child_file->f_ep_links)) { | ||
987 | if (path_count_inc(call_nests)) { | ||
988 | error = -1; | ||
989 | break; | ||
990 | } | ||
991 | } else { | ||
992 | error = ep_call_nested(&poll_loop_ncalls, | ||
993 | EP_MAX_NESTS, | ||
994 | reverse_path_check_proc, | ||
995 | child_file, child_file, | ||
996 | current); | ||
997 | } | ||
998 | if (error != 0) | ||
999 | break; | ||
1000 | } else { | ||
1001 | printk(KERN_ERR "reverse_path_check_proc: " | ||
1002 | "file is not an ep!\n"); | ||
1003 | } | ||
1004 | } | ||
1005 | return error; | ||
1006 | } | ||
1007 | |||
1008 | /** | ||
1009 | * reverse_path_check - The tfile_check_list is list of file *, which have | ||
1010 | * links that are proposed to be newly added. We need to | ||
1011 | * make sure that those added links don't add too many | ||
1012 | * paths such that we will spend all our time waking up | ||
1013 | * eventpoll objects. | ||
1014 | * | ||
1015 | * Returns: Returns zero if the proposed links don't create too many paths, | ||
1016 | * -1 otherwise. | ||
1017 | */ | ||
1018 | static int reverse_path_check(void) | ||
1019 | { | ||
1020 | int length = 0; | ||
1021 | int error = 0; | ||
1022 | struct file *current_file; | ||
1023 | |||
1024 | /* let's call this for all tfiles */ | ||
1025 | list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { | ||
1026 | length++; | ||
1027 | path_count_init(); | ||
1028 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | ||
1029 | reverse_path_check_proc, current_file, | ||
1030 | current_file, current); | ||
1031 | if (error) | ||
1032 | break; | ||
1033 | } | ||
1034 | return error; | ||
1035 | } | ||
1036 | |||
929 | /* | 1037 | /* |
930 | * Must be called with "mtx" held. | 1038 | * Must be called with "mtx" held. |
931 | */ | 1039 | */ |
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
987 | */ | 1095 | */ |
988 | ep_rbtree_insert(ep, epi); | 1096 | ep_rbtree_insert(ep, epi); |
989 | 1097 | ||
1098 | /* now check if we've created too many backpaths */ | ||
1099 | error = -EINVAL; | ||
1100 | if (reverse_path_check()) | ||
1101 | goto error_remove_epi; | ||
1102 | |||
990 | /* We have to drop the new item inside our item list to keep track of it */ | 1103 | /* We have to drop the new item inside our item list to keep track of it */ |
991 | spin_lock_irqsave(&ep->lock, flags); | 1104 | spin_lock_irqsave(&ep->lock, flags); |
992 | 1105 | ||
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
1011 | 1124 | ||
1012 | return 0; | 1125 | return 0; |
1013 | 1126 | ||
1127 | error_remove_epi: | ||
1128 | spin_lock(&tfile->f_lock); | ||
1129 | if (ep_is_linked(&epi->fllink)) | ||
1130 | list_del_init(&epi->fllink); | ||
1131 | spin_unlock(&tfile->f_lock); | ||
1132 | |||
1133 | rb_erase(&epi->rbn, &ep->rbr); | ||
1134 | |||
1014 | error_unregister: | 1135 | error_unregister: |
1015 | ep_unregister_pollwait(ep, epi); | 1136 | ep_unregister_pollwait(ep, epi); |
1016 | 1137 | ||
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1275 | int error = 0; | 1396 | int error = 0; |
1276 | struct file *file = priv; | 1397 | struct file *file = priv; |
1277 | struct eventpoll *ep = file->private_data; | 1398 | struct eventpoll *ep = file->private_data; |
1399 | struct eventpoll *ep_tovisit; | ||
1278 | struct rb_node *rbp; | 1400 | struct rb_node *rbp; |
1279 | struct epitem *epi; | 1401 | struct epitem *epi; |
1280 | 1402 | ||
1281 | mutex_lock_nested(&ep->mtx, call_nests + 1); | 1403 | mutex_lock_nested(&ep->mtx, call_nests + 1); |
1404 | ep->visited = 1; | ||
1405 | list_add(&ep->visited_list_link, &visited_list); | ||
1282 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { | 1406 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { |
1283 | epi = rb_entry(rbp, struct epitem, rbn); | 1407 | epi = rb_entry(rbp, struct epitem, rbn); |
1284 | if (unlikely(is_file_epoll(epi->ffd.file))) { | 1408 | if (unlikely(is_file_epoll(epi->ffd.file))) { |
1409 | ep_tovisit = epi->ffd.file->private_data; | ||
1410 | if (ep_tovisit->visited) | ||
1411 | continue; | ||
1285 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1412 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
1286 | ep_loop_check_proc, epi->ffd.file, | 1413 | ep_loop_check_proc, epi->ffd.file, |
1287 | epi->ffd.file->private_data, current); | 1414 | ep_tovisit, current); |
1288 | if (error != 0) | 1415 | if (error != 0) |
1289 | break; | 1416 | break; |
1417 | } else { | ||
1418 | /* | ||
1419 | * If we've reached a file that is not associated with | ||
1420 | * an ep, then we need to check if the newly added | ||
1421 | * links are going to add too many wakeup paths. We do | ||
1422 | * this by adding it to the tfile_check_list, if it's | ||
1423 | * not already there, and calling reverse_path_check() | ||
1424 | * during ep_insert(). | ||
1425 | */ | ||
1426 | if (list_empty(&epi->ffd.file->f_tfile_llink)) | ||
1427 | list_add(&epi->ffd.file->f_tfile_llink, | ||
1428 | &tfile_check_list); | ||
1290 | } | 1429 | } |
1291 | } | 1430 | } |
1292 | mutex_unlock(&ep->mtx); | 1431 | mutex_unlock(&ep->mtx); |
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1307 | */ | 1446 | */ |
1308 | static int ep_loop_check(struct eventpoll *ep, struct file *file) | 1447 | static int ep_loop_check(struct eventpoll *ep, struct file *file) |
1309 | { | 1448 | { |
1310 | return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1449 | int ret; |
1450 | struct eventpoll *ep_cur, *ep_next; | ||
1451 | |||
1452 | ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | ||
1311 | ep_loop_check_proc, file, ep, current); | 1453 | ep_loop_check_proc, file, ep, current); |
1454 | /* clear visited list */ | ||
1455 | list_for_each_entry_safe(ep_cur, ep_next, &visited_list, | ||
1456 | visited_list_link) { | ||
1457 | ep_cur->visited = 0; | ||
1458 | list_del(&ep_cur->visited_list_link); | ||
1459 | } | ||
1460 | return ret; | ||
1461 | } | ||
1462 | |||
1463 | static void clear_tfile_check_list(void) | ||
1464 | { | ||
1465 | struct file *file; | ||
1466 | |||
1467 | /* first clear the tfile_check_list */ | ||
1468 | while (!list_empty(&tfile_check_list)) { | ||
1469 | file = list_first_entry(&tfile_check_list, struct file, | ||
1470 | f_tfile_llink); | ||
1471 | list_del_init(&file->f_tfile_llink); | ||
1472 | } | ||
1473 | INIT_LIST_HEAD(&tfile_check_list); | ||
1312 | } | 1474 | } |
1313 | 1475 | ||
1314 | /* | 1476 | /* |
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) | |||
1316 | */ | 1478 | */ |
1317 | SYSCALL_DEFINE1(epoll_create1, int, flags) | 1479 | SYSCALL_DEFINE1(epoll_create1, int, flags) |
1318 | { | 1480 | { |
1319 | int error; | 1481 | int error, fd; |
1320 | struct eventpoll *ep = NULL; | 1482 | struct eventpoll *ep = NULL; |
1483 | struct file *file; | ||
1321 | 1484 | ||
1322 | /* Check the EPOLL_* constant for consistency. */ | 1485 | /* Check the EPOLL_* constant for consistency. */ |
1323 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); | 1486 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); |
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) | |||
1334 | * Creates all the items needed to setup an eventpoll file. That is, | 1497 | * Creates all the items needed to setup an eventpoll file. That is, |
1335 | * a file structure and a free file descriptor. | 1498 | * a file structure and a free file descriptor. |
1336 | */ | 1499 | */ |
1337 | error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, | 1500 | fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); |
1501 | if (fd < 0) { | ||
1502 | error = fd; | ||
1503 | goto out_free_ep; | ||
1504 | } | ||
1505 | file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, | ||
1338 | O_RDWR | (flags & O_CLOEXEC)); | 1506 | O_RDWR | (flags & O_CLOEXEC)); |
1339 | if (error < 0) | 1507 | if (IS_ERR(file)) { |
1340 | ep_free(ep); | 1508 | error = PTR_ERR(file); |
1341 | 1509 | goto out_free_fd; | |
1510 | } | ||
1511 | fd_install(fd, file); | ||
1512 | ep->file = file; | ||
1513 | return fd; | ||
1514 | |||
1515 | out_free_fd: | ||
1516 | put_unused_fd(fd); | ||
1517 | out_free_ep: | ||
1518 | ep_free(ep); | ||
1342 | return error; | 1519 | return error; |
1343 | } | 1520 | } |
1344 | 1521 | ||
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1404 | /* | 1581 | /* |
1405 | * When we insert an epoll file descriptor, inside another epoll file | 1582 | * When we insert an epoll file descriptor, inside another epoll file |
1406 | * descriptor, there is the chance of creating closed loops, which are | 1583 | * descriptor, there is the chance of creating closed loops, which are |
1407 | * better handled here than in more critical paths. | 1584 | * better handled here than in more critical paths. While we are |
1585 | * checking for loops we also determine the list of files reachable | ||
1586 | * and hang them on the tfile_check_list, so we can check that we | ||
1587 | * haven't created too many possible wakeup paths. | ||
1408 | * | 1588 | * |
1409 | * We hold epmutex across the loop check and the insert in this case, in | 1589 | * We need to hold the epmutex across both ep_insert and ep_remove |
1410 | * order to prevent two separate inserts from racing and each doing the | 1590 | * because we want to make sure we are looking at a coherent view of |
1411 | * insert "at the same time" such that ep_loop_check passes on both | 1591 | * the epoll network. |
1412 | * before either one does the insert, thereby creating a cycle. | ||
1413 | */ | 1592 | */ |
1414 | if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { | 1593 | if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { |
1415 | mutex_lock(&epmutex); | 1594 | mutex_lock(&epmutex); |
1416 | did_lock_epmutex = 1; | 1595 | did_lock_epmutex = 1; |
1417 | error = -ELOOP; | ||
1418 | if (ep_loop_check(ep, tfile) != 0) | ||
1419 | goto error_tgt_fput; | ||
1420 | } | 1596 | } |
1421 | 1597 | if (op == EPOLL_CTL_ADD) { | |
1598 | if (is_file_epoll(tfile)) { | ||
1599 | error = -ELOOP; | ||
1600 | if (ep_loop_check(ep, tfile) != 0) | ||
1601 | goto error_tgt_fput; | ||
1602 | } else | ||
1603 | list_add(&tfile->f_tfile_llink, &tfile_check_list); | ||
1604 | } | ||
1422 | 1605 | ||
1423 | mutex_lock_nested(&ep->mtx, 0); | 1606 | mutex_lock_nested(&ep->mtx, 0); |
1424 | 1607 | ||
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1437 | error = ep_insert(ep, &epds, tfile, fd); | 1620 | error = ep_insert(ep, &epds, tfile, fd); |
1438 | } else | 1621 | } else |
1439 | error = -EEXIST; | 1622 | error = -EEXIST; |
1623 | clear_tfile_check_list(); | ||
1440 | break; | 1624 | break; |
1441 | case EPOLL_CTL_DEL: | 1625 | case EPOLL_CTL_DEL: |
1442 | if (epi) | 1626 | if (epi) |
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1455 | mutex_unlock(&ep->mtx); | 1639 | mutex_unlock(&ep->mtx); |
1456 | 1640 | ||
1457 | error_tgt_fput: | 1641 | error_tgt_fput: |
1458 | if (unlikely(did_lock_epmutex)) | 1642 | if (did_lock_epmutex) |
1459 | mutex_unlock(&epmutex); | 1643 | mutex_unlock(&epmutex); |
1460 | 1644 | ||
1461 | fput(tfile); | 1645 | fput(tfile); |
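
The user-visible effect of the loop check above: once two epoll descriptors watch each other, the second EPOLL_CTL_ADD is refused with ELOOP instead of silently creating a wakeup cycle. A minimal user-space sketch (illustrative only, not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	if (a < 0 || b < 0)
		return 1;

	/* a watches b: accepted */
	ev.data.fd = b;
	if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev))
		perror("EPOLL_CTL_ADD (a watches b)");

	/* b watches a: this would close the loop, so expect ELOOP here */
	ev.data.fd = a;
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev))
		printf("adding a to b failed: %s (expected ELOOP)\n",
		       strerror(errno));
	return 0;
}
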
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9d0490..1e85a7ac0217 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) | |||
583 | } | 583 | } |
584 | 584 | ||
585 | static int hugetlbfs_migrate_page(struct address_space *mapping, | 585 | static int hugetlbfs_migrate_page(struct address_space *mapping, |
586 | struct page *newpage, struct page *page) | 586 | struct page *newpage, struct page *page, |
587 | enum migrate_mode mode) | ||
587 | { | 588 | { |
588 | int rc; | 589 | int rc; |
589 | 590 | ||
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ee92538b063..8102db9b926c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); | |||
332 | 332 | ||
333 | #ifdef CONFIG_MIGRATION | 333 | #ifdef CONFIG_MIGRATION |
334 | extern int nfs_migrate_page(struct address_space *, | 334 | extern int nfs_migrate_page(struct address_space *, |
335 | struct page *, struct page *); | 335 | struct page *, struct page *, enum migrate_mode); |
336 | #else | 336 | #else |
337 | #define nfs_migrate_page NULL | 337 | #define nfs_migrate_page NULL |
338 | #endif | 338 | #endif |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0c3885255f97..834f0fe96f89 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -1688,7 +1688,7 @@ out_error: | |||
1688 | 1688 | ||
1689 | #ifdef CONFIG_MIGRATION | 1689 | #ifdef CONFIG_MIGRATION |
1690 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, | 1690 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, |
1691 | struct page *page) | 1691 | struct page *page, enum migrate_mode mode) |
1692 | { | 1692 | { |
1693 | /* | 1693 | /* |
1694 | * If PagePrivate is set, then the page is currently associated with | 1694 | * If PagePrivate is set, then the page is currently associated with |
@@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, | |||
1703 | 1703 | ||
1704 | nfs_fscache_release_page(page, GFP_KERNEL); | 1704 | nfs_fscache_release_page(page, GFP_KERNEL); |
1705 | 1705 | ||
1706 | return migrate_page(mapping, newpage, page); | 1706 | return migrate_page(mapping, newpage, page, mode); |
1707 | } | 1707 | } |
1708 | #endif | 1708 | #endif |
1709 | 1709 | ||
diff --git a/fs/pipe.c b/fs/pipe.c --- a/fs/pipe.c +++ b/fs/pipe.c | |||
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) | |||
1137 | if (nr_pages < pipe->nrbufs) | 1137 | if (nr_pages < pipe->nrbufs) |
1138 | return -EBUSY; | 1138 | return -EBUSY; |
1139 | 1139 | ||
1140 | bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); | 1140 | bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); |
1141 | if (unlikely(!bufs)) | 1141 | if (unlikely(!bufs)) |
1142 | return -ENOMEM; | 1142 | return -ENOMEM; |
1143 | 1143 | ||
diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd0..9252ee3b71e3 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
464 | 464 | ||
465 | seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ | 465 | seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ |
466 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ | 466 | %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ |
467 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", | 467 | %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", |
468 | pid_nr_ns(pid, ns), | 468 | pid_nr_ns(pid, ns), |
469 | tcomm, | 469 | tcomm, |
470 | state, | 470 | state, |
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
511 | task->policy, | 511 | task->policy, |
512 | (unsigned long long)delayacct_blkio_ticks(task), | 512 | (unsigned long long)delayacct_blkio_ticks(task), |
513 | cputime_to_clock_t(gtime), | 513 | cputime_to_clock_t(gtime), |
514 | cputime_to_clock_t(cgtime)); | 514 | cputime_to_clock_t(cgtime), |
515 | (mm && permitted) ? mm->start_data : 0, | ||
516 | (mm && permitted) ? mm->end_data : 0, | ||
517 | (mm && permitted) ? mm->start_brk : 0); | ||
515 | if (mm) | 518 | if (mm) |
516 | mmput(mm); | 519 | mmput(mm); |
517 | return 0; | 520 | return 0; |
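
The three new values are appended after cgtime, so with this patch they are the last three fields of /proc/<pid>/stat (and read back as 0 when the caller lacks permission or the task has no mm). A hedged user-space sketch that pulls them off the end of the line:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[4096], *fields[128], *tok;
	int n = 0;
	FILE *f = fopen("/proc/self/stat", "r");

	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	/* taking fields from the end side-steps spaces inside the comm field */
	for (tok = strtok(buf, " \n"); tok && n < 128; tok = strtok(NULL, " \n"))
		fields[n++] = tok;
	if (n >= 3)	/* start_data, end_data, start_brk in this patch's layout */
		printf("start_data=%s end_data=%s start_brk=%s\n",
		       fields[n - 3], fields[n - 2], fields[n - 1]);
	return 0;
}
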
diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb2..5485a5388ecb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) | |||
654 | bool has_perms; | 654 | bool has_perms; |
655 | 655 | ||
656 | task = get_proc_task(inode); | 656 | task = get_proc_task(inode); |
657 | if (!task) | ||
658 | return -ESRCH; | ||
657 | has_perms = has_pid_permissions(pid, task, 1); | 659 | has_perms = has_pid_permissions(pid, task, 1); |
658 | put_task_struct(task); | 660 | put_task_struct(task); |
659 | 661 | ||
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e58fa777fa09..f96a5b58a975 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h | |||
@@ -139,6 +139,20 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
139 | __tlb_remove_tlb_entry(tlb, ptep, address); \ | 139 | __tlb_remove_tlb_entry(tlb, ptep, address); \ |
140 | } while (0) | 140 | } while (0) |
141 | 141 | ||
142 | /** | ||
143 | * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation | ||
144 | * This is a nop so far, because only x86 needs it. | ||
145 | */ | ||
146 | #ifndef __tlb_remove_pmd_tlb_entry | ||
147 | #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) | ||
148 | #endif | ||
149 | |||
150 | #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ | ||
151 | do { \ | ||
152 | tlb->need_flush = 1; \ | ||
153 | __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ | ||
154 | } while (0) | ||
155 | |||
142 | #define pte_free_tlb(tlb, ptep, address) \ | 156 | #define pte_free_tlb(tlb, ptep, address) \ |
143 | do { \ | 157 | do { \ |
144 | tlb->need_flush = 1; \ | 158 | tlb->need_flush = 1; \ |
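
The new macro is consumed by the huge-page teardown path; zap_huge_pmd(), whose prototype gains the addr argument in the huge_mm.h hunk below, is expected to record the pmd right after clearing it. A much-simplified sketch, with splitting, error handling and the actual page release elided (see mm/huge_memory.c for the real function):

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;

	spin_lock(&tlb->mm->page_table_lock);
	orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
	/* make sure the deferred flush in tlb_finish_mmu() covers this pmd */
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	spin_unlock(&tlb->mm->page_table_lock);

	/* ... uncharge and free the huge page that orig_pmd mapped ... */
	return 1;
}
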
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5c4abce94ad1..b936763f2236 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/kexec.h> | 5 | #include <linux/kexec.h> |
6 | #include <linux/device.h> | 6 | #include <linux/device.h> |
7 | #include <linux/proc_fs.h> | 7 | #include <linux/proc_fs.h> |
8 | #include <linux/elf.h> | ||
8 | 9 | ||
9 | #define ELFCORE_ADDR_MAX (-1ULL) | 10 | #define ELFCORE_ADDR_MAX (-1ULL) |
10 | #define ELFCORE_ADDR_ERR (-2ULL) | 11 | #define ELFCORE_ADDR_ERR (-2ULL) |
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f362733186a5..657ab55beda0 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h | |||
@@ -61,6 +61,7 @@ struct file; | |||
61 | static inline void eventpoll_init_file(struct file *file) | 61 | static inline void eventpoll_init_file(struct file *file) |
62 | { | 62 | { |
63 | INIT_LIST_HEAD(&file->f_ep_links); | 63 | INIT_LIST_HEAD(&file->f_ep_links); |
64 | INIT_LIST_HEAD(&file->f_tfile_llink); | ||
64 | } | 65 | } |
65 | 66 | ||
66 | 67 | ||
diff --git a/include/linux/fs.h b/include/linux/fs.h index 7aacf31418fe..4bc8169fb5a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -525,6 +525,7 @@ enum positive_aop_returns { | |||
525 | struct page; | 525 | struct page; |
526 | struct address_space; | 526 | struct address_space; |
527 | struct writeback_control; | 527 | struct writeback_control; |
528 | enum migrate_mode; | ||
528 | 529 | ||
529 | struct iov_iter { | 530 | struct iov_iter { |
530 | const struct iovec *iov; | 531 | const struct iovec *iov; |
@@ -609,9 +610,12 @@ struct address_space_operations { | |||
609 | loff_t offset, unsigned long nr_segs); | 610 | loff_t offset, unsigned long nr_segs); |
610 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, | 611 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, |
611 | void **, unsigned long *); | 612 | void **, unsigned long *); |
612 | /* migrate the contents of a page to the specified target */ | 613 | /* |
614 | * migrate the contents of a page to the specified target. If the | ||
615 | * mode is MIGRATE_ASYNC, it must not block. | ||
616 | */ | ||
613 | int (*migratepage) (struct address_space *, | 617 | int (*migratepage) (struct address_space *, |
614 | struct page *, struct page *); | 618 | struct page *, struct page *, enum migrate_mode); |
615 | int (*launder_page) (struct page *); | 619 | int (*launder_page) (struct page *); |
616 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | 620 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, |
617 | unsigned long); | 621 | unsigned long); |
@@ -656,6 +660,7 @@ struct address_space { | |||
656 | * must be enforced here for CRIS, to let the least significant bit | 660 | * must be enforced here for CRIS, to let the least significant bit |
657 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. | 661 | * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. |
658 | */ | 662 | */ |
663 | struct request_queue; | ||
659 | 664 | ||
660 | struct block_device { | 665 | struct block_device { |
661 | dev_t bd_dev; /* not a kdev_t - it's a search key */ | 666 | dev_t bd_dev; /* not a kdev_t - it's a search key */ |
@@ -678,6 +683,7 @@ struct block_device { | |||
678 | unsigned bd_part_count; | 683 | unsigned bd_part_count; |
679 | int bd_invalidated; | 684 | int bd_invalidated; |
680 | struct gendisk * bd_disk; | 685 | struct gendisk * bd_disk; |
686 | struct request_queue * bd_queue; | ||
681 | struct list_head bd_list; | 687 | struct list_head bd_list; |
682 | /* | 688 | /* |
683 | * Private data. You must have bd_claim'ed the block_device | 689 | * Private data. You must have bd_claim'ed the block_device |
@@ -1001,6 +1007,7 @@ struct file { | |||
1001 | #ifdef CONFIG_EPOLL | 1007 | #ifdef CONFIG_EPOLL |
1002 | /* Used by fs/eventpoll.c to link all the hooks to this file */ | 1008 | /* Used by fs/eventpoll.c to link all the hooks to this file */ |
1003 | struct list_head f_ep_links; | 1009 | struct list_head f_ep_links; |
1010 | struct list_head f_tfile_llink; | ||
1004 | #endif /* #ifdef CONFIG_EPOLL */ | 1011 | #endif /* #ifdef CONFIG_EPOLL */ |
1005 | struct address_space *f_mapping; | 1012 | struct address_space *f_mapping; |
1006 | #ifdef CONFIG_DEBUG_WRITECOUNT | 1013 | #ifdef CONFIG_DEBUG_WRITECOUNT |
@@ -2536,7 +2543,8 @@ extern int generic_check_addressable(unsigned, u64); | |||
2536 | 2543 | ||
2537 | #ifdef CONFIG_MIGRATION | 2544 | #ifdef CONFIG_MIGRATION |
2538 | extern int buffer_migrate_page(struct address_space *, | 2545 | extern int buffer_migrate_page(struct address_space *, |
2539 | struct page *, struct page *); | 2546 | struct page *, struct page *, |
2547 | enum migrate_mode); | ||
2540 | #else | 2548 | #else |
2541 | #define buffer_migrate_page NULL | 2549 | #define buffer_migrate_page NULL |
2542 | #endif | 2550 | #endif |
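
For filesystems with no page-private state the conversion is mechanical: the generic migrate_page() helper already matches the new prototype, so an aops table can keep pointing at it. A hedged sketch (the myfs_* names are hypothetical):

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* hypothetical */
	.writepage	= myfs_writepage,	/* hypothetical */
	/* generic helper; now takes (mapping, newpage, page, enum migrate_mode) */
	.migratepage	= migrate_page,
};
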
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a9ace9c32507..1b921299abc4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
18 | unsigned int flags); | 18 | unsigned int flags); |
19 | extern int zap_huge_pmd(struct mmu_gather *tlb, | 19 | extern int zap_huge_pmd(struct mmu_gather *tlb, |
20 | struct vm_area_struct *vma, | 20 | struct vm_area_struct *vma, |
21 | pmd_t *pmd); | 21 | pmd_t *pmd, unsigned long addr); |
22 | extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 22 | extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
23 | unsigned long addr, unsigned long end, | 23 | unsigned long addr, unsigned long end, |
24 | unsigned char *vec); | 24 | unsigned char *vec); |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a7a0c71661..e8343422240a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -185,16 +185,17 @@ static inline void might_fault(void) | |||
185 | 185 | ||
186 | extern struct atomic_notifier_head panic_notifier_list; | 186 | extern struct atomic_notifier_head panic_notifier_list; |
187 | extern long (*panic_blink)(int state); | 187 | extern long (*panic_blink)(int state); |
188 | NORET_TYPE void panic(const char * fmt, ...) | 188 | __printf(1, 2) |
189 | __attribute__ ((NORET_AND format (printf, 1, 2))) __cold; | 189 | void panic(const char *fmt, ...) |
190 | __noreturn __cold; | ||
190 | extern void oops_enter(void); | 191 | extern void oops_enter(void); |
191 | extern void oops_exit(void); | 192 | extern void oops_exit(void); |
192 | void print_oops_end_marker(void); | 193 | void print_oops_end_marker(void); |
193 | extern int oops_may_print(void); | 194 | extern int oops_may_print(void); |
194 | NORET_TYPE void do_exit(long error_code) | 195 | void do_exit(long error_code) |
195 | ATTRIB_NORET; | 196 | __noreturn; |
196 | NORET_TYPE void complete_and_exit(struct completion *, long) | 197 | void complete_and_exit(struct completion *, long) |
197 | ATTRIB_NORET; | 198 | __noreturn; |
198 | 199 | ||
199 | /* Internal, do not use. */ | 200 | /* Internal, do not use. */ |
200 | int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); | 201 | int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); |
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952188de..fee66317e071 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h | |||
@@ -18,7 +18,6 @@ | |||
18 | enum kmsg_dump_reason { | 18 | enum kmsg_dump_reason { |
19 | KMSG_DUMP_OOPS, | 19 | KMSG_DUMP_OOPS, |
20 | KMSG_DUMP_PANIC, | 20 | KMSG_DUMP_PANIC, |
21 | KMSG_DUMP_KEXEC, | ||
22 | KMSG_DUMP_RESTART, | 21 | KMSG_DUMP_RESTART, |
23 | KMSG_DUMP_HALT, | 22 | KMSG_DUMP_HALT, |
24 | KMSG_DUMP_POWEROFF, | 23 | KMSG_DUMP_POWEROFF, |
diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 3f46aedea42f..807f1e533226 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h | |||
@@ -88,8 +88,4 @@ | |||
88 | 88 | ||
89 | #endif | 89 | #endif |
90 | 90 | ||
91 | #define NORET_TYPE /**/ | ||
92 | #define ATTRIB_NORET __attribute__((noreturn)) | ||
93 | #define NORET_AND noreturn, | ||
94 | |||
95 | #endif | 91 | #endif |
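
Code that still uses the removed macros needs the same one-line conversion as kernel/exit.c further down: drop NORET_TYPE/ATTRIB_NORET and rely on __noreturn from <linux/compiler.h>. A hedged sketch for a hypothetical out-of-tree helper:

#include <linux/compiler.h>

/* was: NORET_TYPE void my_halt_machine(void) ATTRIB_NORET; */
void my_halt_machine(void) __noreturn;

void my_halt_machine(void)
{
	for (;;)
		;	/* hypothetical: spins forever, never returns */
}
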
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f944591765eb..4d34356fe644 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -32,13 +32,11 @@ enum mem_cgroup_page_stat_item { | |||
32 | MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ | 32 | MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ |
33 | }; | 33 | }; |
34 | 34 | ||
35 | extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 35 | struct mem_cgroup_reclaim_cookie { |
36 | struct list_head *dst, | 36 | struct zone *zone; |
37 | unsigned long *scanned, int order, | 37 | int priority; |
38 | isolate_mode_t mode, | 38 | unsigned int generation; |
39 | struct zone *z, | 39 | }; |
40 | struct mem_cgroup *mem_cont, | ||
41 | int active, int file); | ||
42 | 40 | ||
43 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 41 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
44 | /* | 42 | /* |
@@ -56,20 +54,21 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, | |||
56 | gfp_t gfp_mask); | 54 | gfp_t gfp_mask); |
57 | /* for swap handling */ | 55 | /* for swap handling */ |
58 | extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 56 | extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
59 | struct page *page, gfp_t mask, struct mem_cgroup **ptr); | 57 | struct page *page, gfp_t mask, struct mem_cgroup **memcgp); |
60 | extern void mem_cgroup_commit_charge_swapin(struct page *page, | 58 | extern void mem_cgroup_commit_charge_swapin(struct page *page, |
61 | struct mem_cgroup *ptr); | 59 | struct mem_cgroup *memcg); |
62 | extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); | 60 | extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); |
63 | 61 | ||
64 | extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 62 | extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
65 | gfp_t gfp_mask); | 63 | gfp_t gfp_mask); |
66 | extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); | 64 | |
67 | extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); | 65 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); |
68 | extern void mem_cgroup_rotate_reclaimable_page(struct page *page); | 66 | struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *, |
69 | extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); | 67 | enum lru_list); |
70 | extern void mem_cgroup_del_lru(struct page *page); | 68 | void mem_cgroup_lru_del_list(struct page *, enum lru_list); |
71 | extern void mem_cgroup_move_lists(struct page *page, | 69 | void mem_cgroup_lru_del(struct page *); |
72 | enum lru_list from, enum lru_list to); | 70 | struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *, |
71 | enum lru_list, enum lru_list); | ||
73 | 72 | ||
74 | /* For coalescing uncharge for reducing memcg' overhead*/ | 73 | /* For coalescing uncharge for reducing memcg' overhead*/ |
75 | extern void mem_cgroup_uncharge_start(void); | 74 | extern void mem_cgroup_uncharge_start(void); |
@@ -102,10 +101,15 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); | |||
102 | 101 | ||
103 | extern int | 102 | extern int |
104 | mem_cgroup_prepare_migration(struct page *page, | 103 | mem_cgroup_prepare_migration(struct page *page, |
105 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); | 104 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); |
106 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 105 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
107 | struct page *oldpage, struct page *newpage, bool migration_ok); | 106 | struct page *oldpage, struct page *newpage, bool migration_ok); |
108 | 107 | ||
108 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, | ||
109 | struct mem_cgroup *, | ||
110 | struct mem_cgroup_reclaim_cookie *); | ||
111 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | ||
112 | |||
109 | /* | 113 | /* |
110 | * For memory reclaim. | 114 | * For memory reclaim. |
111 | */ | 115 | */ |
@@ -122,7 +126,10 @@ struct zone_reclaim_stat* | |||
122 | mem_cgroup_get_reclaim_stat_from_page(struct page *page); | 126 | mem_cgroup_get_reclaim_stat_from_page(struct page *page); |
123 | extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | 127 | extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, |
124 | struct task_struct *p); | 128 | struct task_struct *p); |
129 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, | ||
130 | struct page *newpage); | ||
125 | 131 | ||
132 | extern void mem_cgroup_reset_owner(struct page *page); | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 133 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
127 | extern int do_swap_account; | 134 | extern int do_swap_account; |
128 | #endif | 135 | #endif |
@@ -157,7 +164,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); | |||
157 | 164 | ||
158 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 165 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
159 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 166 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
160 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); | 167 | void mem_cgroup_split_huge_fixup(struct page *head); |
161 | #endif | 168 | #endif |
162 | 169 | ||
163 | #ifdef CONFIG_DEBUG_VM | 170 | #ifdef CONFIG_DEBUG_VM |
@@ -180,17 +187,17 @@ static inline int mem_cgroup_cache_charge(struct page *page, | |||
180 | } | 187 | } |
181 | 188 | ||
182 | static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 189 | static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
183 | struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) | 190 | struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) |
184 | { | 191 | { |
185 | return 0; | 192 | return 0; |
186 | } | 193 | } |
187 | 194 | ||
188 | static inline void mem_cgroup_commit_charge_swapin(struct page *page, | 195 | static inline void mem_cgroup_commit_charge_swapin(struct page *page, |
189 | struct mem_cgroup *ptr) | 196 | struct mem_cgroup *memcg) |
190 | { | 197 | { |
191 | } | 198 | } |
192 | 199 | ||
193 | static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) | 200 | static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) |
194 | { | 201 | { |
195 | } | 202 | } |
196 | 203 | ||
@@ -210,33 +217,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) | |||
210 | { | 217 | { |
211 | } | 218 | } |
212 | 219 | ||
213 | static inline void mem_cgroup_add_lru_list(struct page *page, int lru) | 220 | static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, |
214 | { | 221 | struct mem_cgroup *memcg) |
215 | } | ||
216 | |||
217 | static inline void mem_cgroup_del_lru_list(struct page *page, int lru) | ||
218 | { | 222 | { |
219 | return ; | 223 | return &zone->lruvec; |
220 | } | 224 | } |
221 | 225 | ||
222 | static inline void mem_cgroup_rotate_reclaimable_page(struct page *page) | 226 | static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, |
227 | struct page *page, | ||
228 | enum lru_list lru) | ||
223 | { | 229 | { |
224 | return ; | 230 | return &zone->lruvec; |
225 | } | 231 | } |
226 | 232 | ||
227 | static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) | 233 | static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) |
228 | { | 234 | { |
229 | return ; | ||
230 | } | 235 | } |
231 | 236 | ||
232 | static inline void mem_cgroup_del_lru(struct page *page) | 237 | static inline void mem_cgroup_lru_del(struct page *page) |
233 | { | 238 | { |
234 | return ; | ||
235 | } | 239 | } |
236 | 240 | ||
237 | static inline void | 241 | static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, |
238 | mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) | 242 | struct page *page, |
243 | enum lru_list from, | ||
244 | enum lru_list to) | ||
239 | { | 245 | { |
246 | return &zone->lruvec; | ||
240 | } | 247 | } |
241 | 248 | ||
242 | static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 249 | static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -269,7 +276,7 @@ static inline struct cgroup_subsys_state | |||
269 | 276 | ||
270 | static inline int | 277 | static inline int |
271 | mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | 278 | mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
272 | struct mem_cgroup **ptr, gfp_t gfp_mask) | 279 | struct mem_cgroup **memcgp, gfp_t gfp_mask) |
273 | { | 280 | { |
274 | return 0; | 281 | return 0; |
275 | } | 282 | } |
@@ -279,6 +286,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
279 | { | 286 | { |
280 | } | 287 | } |
281 | 288 | ||
289 | static inline struct mem_cgroup * | ||
290 | mem_cgroup_iter(struct mem_cgroup *root, | ||
291 | struct mem_cgroup *prev, | ||
292 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
293 | { | ||
294 | return NULL; | ||
295 | } | ||
296 | |||
297 | static inline void mem_cgroup_iter_break(struct mem_cgroup *root, | ||
298 | struct mem_cgroup *prev) | ||
299 | { | ||
300 | } | ||
301 | |||
282 | static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) | 302 | static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) |
283 | { | 303 | { |
284 | return 0; | 304 | return 0; |
@@ -360,8 +380,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
360 | return 0; | 380 | return 0; |
361 | } | 381 | } |
362 | 382 | ||
363 | static inline void mem_cgroup_split_huge_fixup(struct page *head, | 383 | static inline void mem_cgroup_split_huge_fixup(struct page *head) |
364 | struct page *tail) | ||
365 | { | 384 | { |
366 | } | 385 | } |
367 | 386 | ||
@@ -369,6 +388,14 @@ static inline | |||
369 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 388 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
370 | { | 389 | { |
371 | } | 390 | } |
391 | static inline void mem_cgroup_replace_page_cache(struct page *oldpage, | ||
392 | struct page *newpage) | ||
393 | { | ||
394 | } | ||
395 | |||
396 | static inline void mem_cgroup_reset_owner(struct page *page) | ||
397 | { | ||
398 | } | ||
372 | #endif /* CONFIG_CGROUP_MEM_CONT */ | 399 | #endif /* CONFIG_CGROUP_MEM_CONT */ |
373 | 400 | ||
374 | #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) | 401 | #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) |
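
mem_cgroup_iter()/mem_cgroup_iter_break() replace the old per-memcg isolate entry point with a hierarchy walk driven by the caller. A hedged sketch of the intended pattern (the real user is the reclaim code in mm/vmscan.c, which is not among the hunks shown here):

static void walk_hierarchy_sketch(struct mem_cgroup *root, struct zone *zone,
				  int priority)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = priority,
	};
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		/* ... shrink the per-(zone, memcg) LRU lists here ... */

		/* an early break must let the iterator drop its references:
		 *	mem_cgroup_iter_break(root, memcg);
		 */
		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);
}
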
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e39aeecfe9a2..eaf867412f7a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -6,18 +6,31 @@ | |||
6 | 6 | ||
7 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); | 7 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); |
8 | 8 | ||
9 | /* | ||
10 | * MIGRATE_ASYNC means never block | ||
11 | * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking | ||
12 | * on most operations but not ->writepage as the potential stall time | ||
13 | * is too significant | ||
14 | * MIGRATE_SYNC will block when migrating pages | ||
15 | */ | ||
16 | enum migrate_mode { | ||
17 | MIGRATE_ASYNC, | ||
18 | MIGRATE_SYNC_LIGHT, | ||
19 | MIGRATE_SYNC, | ||
20 | }; | ||
21 | |||
9 | #ifdef CONFIG_MIGRATION | 22 | #ifdef CONFIG_MIGRATION |
10 | #define PAGE_MIGRATION 1 | 23 | #define PAGE_MIGRATION 1 |
11 | 24 | ||
12 | extern void putback_lru_pages(struct list_head *l); | 25 | extern void putback_lru_pages(struct list_head *l); |
13 | extern int migrate_page(struct address_space *, | 26 | extern int migrate_page(struct address_space *, |
14 | struct page *, struct page *); | 27 | struct page *, struct page *, enum migrate_mode); |
15 | extern int migrate_pages(struct list_head *l, new_page_t x, | 28 | extern int migrate_pages(struct list_head *l, new_page_t x, |
16 | unsigned long private, bool offlining, | 29 | unsigned long private, bool offlining, |
17 | bool sync); | 30 | enum migrate_mode mode); |
18 | extern int migrate_huge_pages(struct list_head *l, new_page_t x, | 31 | extern int migrate_huge_pages(struct list_head *l, new_page_t x, |
19 | unsigned long private, bool offlining, | 32 | unsigned long private, bool offlining, |
20 | bool sync); | 33 | enum migrate_mode mode); |
21 | 34 | ||
22 | extern int fail_migrate_page(struct address_space *, | 35 | extern int fail_migrate_page(struct address_space *, |
23 | struct page *, struct page *); | 36 | struct page *, struct page *); |
@@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
36 | static inline void putback_lru_pages(struct list_head *l) {} | 49 | static inline void putback_lru_pages(struct list_head *l) {} |
37 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 50 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
38 | unsigned long private, bool offlining, | 51 | unsigned long private, bool offlining, |
39 | bool sync) { return -ENOSYS; } | 52 | enum migrate_mode mode) { return -ENOSYS; } |
40 | static inline int migrate_huge_pages(struct list_head *l, new_page_t x, | 53 | static inline int migrate_huge_pages(struct list_head *l, new_page_t x, |
41 | unsigned long private, bool offlining, | 54 | unsigned long private, bool offlining, |
42 | bool sync) { return -ENOSYS; } | 55 | enum migrate_mode mode) { return -ENOSYS; } |
43 | 56 | ||
44 | static inline int migrate_prep(void) { return -ENOSYS; } | 57 | static inline int migrate_prep(void) { return -ENOSYS; } |
45 | static inline int migrate_prep_local(void) { return -ENOSYS; } | 58 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
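
Callers now express how much blocking they tolerate instead of passing a bare bool. For instance, memory compaction is expected to ask for MIGRATE_SYNC_LIGHT rather than full MIGRATE_SYNC when its caller allowed blocking; a hedged fragment modelled on mm/compaction.c (not among the hunks shown here; cc->sync and compaction_alloc() are shown for illustration only):

		nr_err = migrate_pages(&cc->migratepages, compaction_alloc,
				       (unsigned long)cc, false,
				       cc->sync ? MIGRATE_SYNC_LIGHT
						: MIGRATE_ASYNC);
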
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f7d24712dc1..227fd3e9a9c9 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h | |||
@@ -22,26 +22,21 @@ static inline int page_is_file_cache(struct page *page) | |||
22 | } | 22 | } |
23 | 23 | ||
24 | static inline void | 24 | static inline void |
25 | __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, | 25 | add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru) |
26 | struct list_head *head) | ||
27 | { | 26 | { |
28 | list_add(&page->lru, head); | 27 | struct lruvec *lruvec; |
29 | __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); | ||
30 | mem_cgroup_add_lru_list(page, l); | ||
31 | } | ||
32 | 28 | ||
33 | static inline void | 29 | lruvec = mem_cgroup_lru_add_list(zone, page, lru); |
34 | add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) | 30 | list_add(&page->lru, &lruvec->lists[lru]); |
35 | { | 31 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page)); |
36 | __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); | ||
37 | } | 32 | } |
38 | 33 | ||
39 | static inline void | 34 | static inline void |
40 | del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) | 35 | del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru) |
41 | { | 36 | { |
37 | mem_cgroup_lru_del_list(page, lru); | ||
42 | list_del(&page->lru); | 38 | list_del(&page->lru); |
43 | __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); | 39 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); |
44 | mem_cgroup_del_lru_list(page, l); | ||
45 | } | 40 | } |
46 | 41 | ||
47 | /** | 42 | /** |
@@ -59,24 +54,28 @@ static inline enum lru_list page_lru_base_type(struct page *page) | |||
59 | return LRU_INACTIVE_ANON; | 54 | return LRU_INACTIVE_ANON; |
60 | } | 55 | } |
61 | 56 | ||
62 | static inline void | 57 | /** |
63 | del_page_from_lru(struct zone *zone, struct page *page) | 58 | * page_off_lru - which LRU list was page on? clearing its lru flags. |
59 | * @page: the page to test | ||
60 | * | ||
61 | * Returns the LRU list a page was on, as an index into the array of LRU | ||
62 | * lists; and clears its Unevictable or Active flags, ready for freeing. | ||
63 | */ | ||
64 | static inline enum lru_list page_off_lru(struct page *page) | ||
64 | { | 65 | { |
65 | enum lru_list l; | 66 | enum lru_list lru; |
66 | 67 | ||
67 | list_del(&page->lru); | ||
68 | if (PageUnevictable(page)) { | 68 | if (PageUnevictable(page)) { |
69 | __ClearPageUnevictable(page); | 69 | __ClearPageUnevictable(page); |
70 | l = LRU_UNEVICTABLE; | 70 | lru = LRU_UNEVICTABLE; |
71 | } else { | 71 | } else { |
72 | l = page_lru_base_type(page); | 72 | lru = page_lru_base_type(page); |
73 | if (PageActive(page)) { | 73 | if (PageActive(page)) { |
74 | __ClearPageActive(page); | 74 | __ClearPageActive(page); |
75 | l += LRU_ACTIVE; | 75 | lru += LRU_ACTIVE; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); | 78 | return lru; |
79 | mem_cgroup_del_lru_list(page, l); | ||
80 | } | 79 | } |
81 | 80 | ||
82 | /** | 81 | /** |
@@ -97,7 +96,6 @@ static inline enum lru_list page_lru(struct page *page) | |||
97 | if (PageActive(page)) | 96 | if (PageActive(page)) |
98 | lru += LRU_ACTIVE; | 97 | lru += LRU_ACTIVE; |
99 | } | 98 | } |
100 | |||
101 | return lru; | 99 | return lru; |
102 | } | 100 | } |
103 | 101 | ||
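
page_off_lru() is aimed at release paths that already hold zone->lru_lock and need to know which list statistics to decrement; a hedged fragment of the expected pattern (the matching mm/swap.c hunk is not shown here):

		/* caller holds zone->lru_lock and has verified PageLRU(page) */
		__ClearPageLRU(page);
		del_page_from_lru_list(zone, page, page_off_lru(page));
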
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b42f1b34eb7..3cc3062b3767 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -151,12 +151,11 @@ struct page { | |||
151 | #endif | 151 | #endif |
152 | } | 152 | } |
153 | /* | 153 | /* |
154 | * If another subsystem starts using the double word pairing for atomic | 154 | * The struct page can be forced to be double word aligned so that atomic ops |
155 | * operations on struct page then it must change the #if to ensure | 155 | * on double words work. The SLUB allocator can make use of such a feature. |
156 | * proper alignment of the page struct. | ||
157 | */ | 156 | */ |
158 | #if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) | 157 | #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE |
159 | __attribute__((__aligned__(2*sizeof(unsigned long)))) | 158 | __aligned(2 * sizeof(unsigned long)) |
160 | #endif | 159 | #endif |
161 | ; | 160 | ; |
162 | 161 | ||
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ca6ca92418a6..650ba2fb3301 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -140,25 +140,29 @@ enum lru_list { | |||
140 | NR_LRU_LISTS | 140 | NR_LRU_LISTS |
141 | }; | 141 | }; |
142 | 142 | ||
143 | #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) | 143 | #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) |
144 | 144 | ||
145 | #define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) | 145 | #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) |
146 | 146 | ||
147 | static inline int is_file_lru(enum lru_list l) | 147 | static inline int is_file_lru(enum lru_list lru) |
148 | { | 148 | { |
149 | return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); | 149 | return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); |
150 | } | 150 | } |
151 | 151 | ||
152 | static inline int is_active_lru(enum lru_list l) | 152 | static inline int is_active_lru(enum lru_list lru) |
153 | { | 153 | { |
154 | return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); | 154 | return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); |
155 | } | 155 | } |
156 | 156 | ||
157 | static inline int is_unevictable_lru(enum lru_list l) | 157 | static inline int is_unevictable_lru(enum lru_list lru) |
158 | { | 158 | { |
159 | return (l == LRU_UNEVICTABLE); | 159 | return (lru == LRU_UNEVICTABLE); |
160 | } | 160 | } |
161 | 161 | ||
162 | struct lruvec { | ||
163 | struct list_head lists[NR_LRU_LISTS]; | ||
164 | }; | ||
165 | |||
162 | /* Mask used at gathering information at once (see memcontrol.c) */ | 166 | /* Mask used at gathering information at once (see memcontrol.c) */ |
163 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) | 167 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) |
164 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) | 168 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) |
@@ -173,6 +177,8 @@ static inline int is_unevictable_lru(enum lru_list l) | |||
173 | #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) | 177 | #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) |
174 | /* Isolate unmapped file */ | 178 | /* Isolate unmapped file */ |
175 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) | 179 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) |
180 | /* Isolate for asynchronous migration */ | ||
181 | #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) | ||
176 | 182 | ||
177 | /* LRU Isolation modes. */ | 183 | /* LRU Isolation modes. */ |
178 | typedef unsigned __bitwise__ isolate_mode_t; | 184 | typedef unsigned __bitwise__ isolate_mode_t; |
@@ -364,10 +370,8 @@ struct zone { | |||
364 | ZONE_PADDING(_pad1_) | 370 | ZONE_PADDING(_pad1_) |
365 | 371 | ||
366 | /* Fields commonly accessed by the page reclaim scanner */ | 372 | /* Fields commonly accessed by the page reclaim scanner */ |
367 | spinlock_t lru_lock; | 373 | spinlock_t lru_lock; |
368 | struct zone_lru { | 374 | struct lruvec lruvec; |
369 | struct list_head list; | ||
370 | } lru[NR_LRU_LISTS]; | ||
371 | 375 | ||
372 | struct zone_reclaim_stat reclaim_stat; | 376 | struct zone_reclaim_stat reclaim_stat; |
373 | 377 | ||
diff --git a/include/linux/oom.h b/include/linux/oom.h index 6f9d04a85336..552fba9c7d5a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -43,7 +43,7 @@ enum oom_constraint { | |||
43 | extern void compare_swap_oom_score_adj(int old_val, int new_val); | 43 | extern void compare_swap_oom_score_adj(int old_val, int new_val); |
44 | extern int test_set_oom_score_adj(int new_val); | 44 | extern int test_set_oom_score_adj(int new_val); |
45 | 45 | ||
46 | extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | 46 | extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
47 | const nodemask_t *nodemask, unsigned long totalpages); | 47 | const nodemask_t *nodemask, unsigned long totalpages); |
48 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 48 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
49 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 49 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 961ecc7d30bc..a2d11771c84b 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h | |||
@@ -10,8 +10,6 @@ enum { | |||
10 | /* flags for mem_cgroup and file and I/O status */ | 10 | /* flags for mem_cgroup and file and I/O status */ |
11 | PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ | 11 | PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ |
12 | PCG_FILE_MAPPED, /* page is accounted as "mapped" */ | 12 | PCG_FILE_MAPPED, /* page is accounted as "mapped" */ |
13 | /* No lock in page_cgroup */ | ||
14 | PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ | ||
15 | __NR_PCG_FLAGS, | 13 | __NR_PCG_FLAGS, |
16 | }; | 14 | }; |
17 | 15 | ||
@@ -31,7 +29,6 @@ enum { | |||
31 | struct page_cgroup { | 29 | struct page_cgroup { |
32 | unsigned long flags; | 30 | unsigned long flags; |
33 | struct mem_cgroup *mem_cgroup; | 31 | struct mem_cgroup *mem_cgroup; |
34 | struct list_head lru; /* per cgroup LRU list */ | ||
35 | }; | 32 | }; |
36 | 33 | ||
37 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); | 34 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); |
@@ -76,12 +73,6 @@ TESTPCGFLAG(Used, USED) | |||
76 | CLEARPCGFLAG(Used, USED) | 73 | CLEARPCGFLAG(Used, USED) |
77 | SETPCGFLAG(Used, USED) | 74 | SETPCGFLAG(Used, USED) |
78 | 75 | ||
79 | SETPCGFLAG(AcctLRU, ACCT_LRU) | ||
80 | CLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
81 | TESTPCGFLAG(AcctLRU, ACCT_LRU) | ||
82 | TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
83 | |||
84 | |||
85 | SETPCGFLAG(FileMapped, FILE_MAPPED) | 76 | SETPCGFLAG(FileMapped, FILE_MAPPED) |
86 | CLEARPCGFLAG(FileMapped, FILE_MAPPED) | 77 | CLEARPCGFLAG(FileMapped, FILE_MAPPED) |
87 | TESTPCGFLAG(FileMapped, FILE_MAPPED) | 78 | TESTPCGFLAG(FileMapped, FILE_MAPPED) |
@@ -122,39 +113,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc, | |||
122 | local_irq_restore(*flags); | 113 | local_irq_restore(*flags); |
123 | } | 114 | } |
124 | 115 | ||
125 | #ifdef CONFIG_SPARSEMEM | ||
126 | #define PCG_ARRAYID_WIDTH SECTIONS_SHIFT | ||
127 | #else | ||
128 | #define PCG_ARRAYID_WIDTH NODES_SHIFT | ||
129 | #endif | ||
130 | |||
131 | #if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) | ||
132 | #error Not enough space left in pc->flags to store page_cgroup array IDs | ||
133 | #endif | ||
134 | |||
135 | /* pc->flags: ARRAY-ID | FLAGS */ | ||
136 | |||
137 | #define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) | ||
138 | |||
139 | #define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) | ||
140 | /* | ||
141 | * Zero the shift count for non-existent fields, to prevent compiler | ||
142 | * warnings and ensure references are optimized away. | ||
143 | */ | ||
144 | #define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) | ||
145 | |||
146 | static inline void set_page_cgroup_array_id(struct page_cgroup *pc, | ||
147 | unsigned long id) | ||
148 | { | ||
149 | pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); | ||
150 | pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; | ||
151 | } | ||
152 | |||
153 | static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) | ||
154 | { | ||
155 | return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; | ||
156 | } | ||
157 | |||
158 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ | 116 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ |
159 | struct page_cgroup; | 117 | struct page_cgroup; |
160 | 118 | ||
@@ -183,7 +141,7 @@ static inline void __init page_cgroup_init_flatmem(void) | |||
183 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | 141 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, |
184 | unsigned short old, unsigned short new); | 142 | unsigned short old, unsigned short new); |
185 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); | 143 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); |
186 | extern unsigned short lookup_swap_cgroup(swp_entry_t ent); | 144 | extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); |
187 | extern int swap_cgroup_swapon(int type, unsigned long max_pages); | 145 | extern int swap_cgroup_swapon(int type, unsigned long max_pages); |
188 | extern void swap_cgroup_swapoff(int type); | 146 | extern void swap_cgroup_swapoff(int type); |
189 | #else | 147 | #else |
@@ -195,7 +153,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
195 | } | 153 | } |
196 | 154 | ||
197 | static inline | 155 | static inline |
198 | unsigned short lookup_swap_cgroup(swp_entry_t ent) | 156 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) |
199 | { | 157 | { |
200 | return 0; | 158 | return 0; |
201 | } | 159 | } |
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ed17024d2ebe..2aa12b8499c0 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h | |||
@@ -21,8 +21,7 @@ struct pagevec { | |||
21 | }; | 21 | }; |
22 | 22 | ||
23 | void __pagevec_release(struct pagevec *pvec); | 23 | void __pagevec_release(struct pagevec *pvec); |
24 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); | 24 | void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); |
25 | void pagevec_strip(struct pagevec *pvec); | ||
26 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | 25 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, |
27 | pgoff_t start, unsigned nr_pages); | 26 | pgoff_t start, unsigned nr_pages); |
28 | unsigned pagevec_lookup_tag(struct pagevec *pvec, | 27 | unsigned pagevec_lookup_tag(struct pagevec *pvec, |
@@ -59,7 +58,6 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) | |||
59 | return pagevec_space(pvec); | 58 | return pagevec_space(pvec); |
60 | } | 59 | } |
61 | 60 | ||
62 | |||
63 | static inline void pagevec_release(struct pagevec *pvec) | 61 | static inline void pagevec_release(struct pagevec *pvec) |
64 | { | 62 | { |
65 | if (pagevec_count(pvec)) | 63 | if (pagevec_count(pvec)) |
@@ -68,22 +66,22 @@ static inline void pagevec_release(struct pagevec *pvec) | |||
68 | 66 | ||
69 | static inline void __pagevec_lru_add_anon(struct pagevec *pvec) | 67 | static inline void __pagevec_lru_add_anon(struct pagevec *pvec) |
70 | { | 68 | { |
71 | ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); | 69 | __pagevec_lru_add(pvec, LRU_INACTIVE_ANON); |
72 | } | 70 | } |
73 | 71 | ||
74 | static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) | 72 | static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) |
75 | { | 73 | { |
76 | ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); | 74 | __pagevec_lru_add(pvec, LRU_ACTIVE_ANON); |
77 | } | 75 | } |
78 | 76 | ||
79 | static inline void __pagevec_lru_add_file(struct pagevec *pvec) | 77 | static inline void __pagevec_lru_add_file(struct pagevec *pvec) |
80 | { | 78 | { |
81 | ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); | 79 | __pagevec_lru_add(pvec, LRU_INACTIVE_FILE); |
82 | } | 80 | } |
83 | 81 | ||
84 | static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) | 82 | static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) |
85 | { | 83 | { |
86 | ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); | 84 | __pagevec_lru_add(pvec, LRU_ACTIVE_FILE); |
87 | } | 85 | } |
88 | 86 | ||
89 | static inline void pagevec_lru_add_file(struct pagevec *pvec) | 87 | static inline void pagevec_lru_add_file(struct pagevec *pvec) |
diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2c2161..7ddc7f1b480f 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h | |||
@@ -102,4 +102,16 @@ | |||
102 | 102 | ||
103 | #define PR_MCE_KILL_GET 34 | 103 | #define PR_MCE_KILL_GET 34 |
104 | 104 | ||
105 | /* | ||
106 | * Tune up process memory map specifics. | ||
107 | */ | ||
108 | #define PR_SET_MM 35 | ||
109 | # define PR_SET_MM_START_CODE 1 | ||
110 | # define PR_SET_MM_END_CODE 2 | ||
111 | # define PR_SET_MM_START_DATA 3 | ||
112 | # define PR_SET_MM_END_DATA 4 | ||
113 | # define PR_SET_MM_START_STACK 5 | ||
114 | # define PR_SET_MM_START_BRK 6 | ||
115 | # define PR_SET_MM_BRK 7 | ||
116 | |||
105 | #endif /* _LINUX_PRCTL_H */ | 117 | #endif /* _LINUX_PRCTL_H */ |
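
Together with CONFIG_CHECKPOINT_RESTORE (see the init/Kconfig hunk below) and the new /proc/<pid>/stat fields above, these codes let a restore tool rebuild a task's memory-map bookkeeping. A hedged user-space sketch; the address is made up purely for illustration, and the call likely needs CAP_SYS_RESOURCE and a kernel built with the new option:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35
#define PR_SET_MM_START_BRK	6
#endif

int main(void)
{
	/* a real restorer would pass the value saved from the original task */
	unsigned long new_start_brk = 0x0f00000;

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_start_brk, 0, 0))
		perror("prctl(PR_SET_MM)");
	return 0;
}
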
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 9d4539c52e53..07e360b1b282 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
@@ -49,9 +49,6 @@ | |||
49 | #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 | 49 | #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 |
50 | #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 | 50 | #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 |
51 | 51 | ||
52 | #define radix_tree_indirect_to_ptr(ptr) \ | ||
53 | radix_tree_indirect_to_ptr((void __force *)(ptr)) | ||
54 | |||
55 | static inline int radix_tree_is_indirect_ptr(void *ptr) | 52 | static inline int radix_tree_is_indirect_ptr(void *ptr) |
56 | { | 53 | { |
57 | return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); | 54 | return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1afb9954bbf1..1cdd62a2788a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -158,7 +158,7 @@ static inline void page_dup_rmap(struct page *page) | |||
158 | * Called from mm/vmscan.c to handle paging out | 158 | * Called from mm/vmscan.c to handle paging out |
159 | */ | 159 | */ |
160 | int page_referenced(struct page *, int is_locked, | 160 | int page_referenced(struct page *, int is_locked, |
161 | struct mem_cgroup *cnt, unsigned long *vm_flags); | 161 | struct mem_cgroup *memcg, unsigned long *vm_flags); |
162 | int page_referenced_one(struct page *, struct vm_area_struct *, | 162 | int page_referenced_one(struct page *, struct vm_area_struct *, |
163 | unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); | 163 | unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); |
164 | 164 | ||
@@ -236,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | |||
236 | #define anon_vma_link(vma) do {} while (0) | 236 | #define anon_vma_link(vma) do {} while (0) |
237 | 237 | ||
238 | static inline int page_referenced(struct page *page, int is_locked, | 238 | static inline int page_referenced(struct page *page, int is_locked, |
239 | struct mem_cgroup *cnt, | 239 | struct mem_cgroup *memcg, |
240 | unsigned long *vm_flags) | 240 | unsigned long *vm_flags) |
241 | { | 241 | { |
242 | *vm_flags = 0; | 242 | *vm_flags = 0; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd0303af51..4032ec1cf836 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); | |||
2275 | extern void exit_itimers(struct signal_struct *); | 2275 | extern void exit_itimers(struct signal_struct *); |
2276 | extern void flush_itimer_signals(void); | 2276 | extern void flush_itimer_signals(void); |
2277 | 2277 | ||
2278 | extern NORET_TYPE void do_group_exit(int); | 2278 | extern void do_group_exit(int); |
2279 | 2279 | ||
2280 | extern void daemonize(const char *, ...); | 2280 | extern void daemonize(const char *, ...); |
2281 | extern int allow_signal(int); | 2281 | extern int allow_signal(int); |
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index edc4b3d25a2d..f64560e204bc 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h | |||
@@ -266,9 +266,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
266 | unsigned long nr_lumpy_taken, | 266 | unsigned long nr_lumpy_taken, |
267 | unsigned long nr_lumpy_dirty, | 267 | unsigned long nr_lumpy_dirty, |
268 | unsigned long nr_lumpy_failed, | 268 | unsigned long nr_lumpy_failed, |
269 | isolate_mode_t isolate_mode), | 269 | isolate_mode_t isolate_mode, |
270 | int file), | ||
270 | 271 | ||
271 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), | 272 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file), |
272 | 273 | ||
273 | TP_STRUCT__entry( | 274 | TP_STRUCT__entry( |
274 | __field(int, order) | 275 | __field(int, order) |
@@ -279,6 +280,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
279 | __field(unsigned long, nr_lumpy_dirty) | 280 | __field(unsigned long, nr_lumpy_dirty) |
280 | __field(unsigned long, nr_lumpy_failed) | 281 | __field(unsigned long, nr_lumpy_failed) |
281 | __field(isolate_mode_t, isolate_mode) | 282 | __field(isolate_mode_t, isolate_mode) |
283 | __field(int, file) | ||
282 | ), | 284 | ), |
283 | 285 | ||
284 | TP_fast_assign( | 286 | TP_fast_assign( |
@@ -290,9 +292,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
290 | __entry->nr_lumpy_dirty = nr_lumpy_dirty; | 292 | __entry->nr_lumpy_dirty = nr_lumpy_dirty; |
291 | __entry->nr_lumpy_failed = nr_lumpy_failed; | 293 | __entry->nr_lumpy_failed = nr_lumpy_failed; |
292 | __entry->isolate_mode = isolate_mode; | 294 | __entry->isolate_mode = isolate_mode; |
295 | __entry->file = file; | ||
293 | ), | 296 | ), |
294 | 297 | ||
295 | TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu", | 298 | TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d", |
296 | __entry->isolate_mode, | 299 | __entry->isolate_mode, |
297 | __entry->order, | 300 | __entry->order, |
298 | __entry->nr_requested, | 301 | __entry->nr_requested, |
@@ -300,7 +303,8 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
300 | __entry->nr_taken, | 303 | __entry->nr_taken, |
301 | __entry->nr_lumpy_taken, | 304 | __entry->nr_lumpy_taken, |
302 | __entry->nr_lumpy_dirty, | 305 | __entry->nr_lumpy_dirty, |
303 | __entry->nr_lumpy_failed) | 306 | __entry->nr_lumpy_failed, |
307 | __entry->file) | ||
304 | ); | 308 | ); |
305 | 309 | ||
306 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, | 310 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, |
@@ -312,9 +316,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, | |||
312 | unsigned long nr_lumpy_taken, | 316 | unsigned long nr_lumpy_taken, |
313 | unsigned long nr_lumpy_dirty, | 317 | unsigned long nr_lumpy_dirty, |
314 | unsigned long nr_lumpy_failed, | 318 | unsigned long nr_lumpy_failed, |
315 | isolate_mode_t isolate_mode), | 319 | isolate_mode_t isolate_mode, |
320 | int file), | ||
316 | 321 | ||
317 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) | 322 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) |
318 | 323 | ||
319 | ); | 324 | ); |
320 | 325 | ||
@@ -327,9 +332,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, | |||
327 | unsigned long nr_lumpy_taken, | 332 | unsigned long nr_lumpy_taken, |
328 | unsigned long nr_lumpy_dirty, | 333 | unsigned long nr_lumpy_dirty, |
329 | unsigned long nr_lumpy_failed, | 334 | unsigned long nr_lumpy_failed, |
330 | isolate_mode_t isolate_mode), | 335 | isolate_mode_t isolate_mode, |
336 | int file), | ||
331 | 337 | ||
332 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) | 338 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) |
333 | 339 | ||
334 | ); | 340 | ); |
335 | 341 | ||
diff --git a/init/Kconfig b/init/Kconfig index 018d206c21f7..6ac2236244c3 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -783,6 +783,17 @@ config DEBUG_BLK_CGROUP | |||
783 | 783 | ||
784 | endif # CGROUPS | 784 | endif # CGROUPS |
785 | 785 | ||
786 | config CHECKPOINT_RESTORE | ||
787 | bool "Checkpoint/restore support" if EXPERT | ||
788 | default n | ||
789 | help | ||
790 | Enables additional kernel features for the sake of checkpoint/restore. | ||
791 | In particular it adds auxiliary prctl codes to set up process text, | ||
792 | data and heap segment sizes, and a few additional /proc filesystem | ||
793 | entries. | ||
794 | |||
795 | If unsure, say N here. | ||
796 | |||
786 | menuconfig NAMESPACES | 797 | menuconfig NAMESPACES |
787 | bool "Namespaces support" if EXPERT | 798 | bool "Namespaces support" if EXPERT |
788 | default !EXPERT | 799 | default !EXPERT |
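
The new option is consumed at build time as an ordinary CONFIG_ preprocessor symbol; the prctl_set_mm() changes in kernel/sys.c later in this patch use exactly that pattern, compiling a stub when the option is off. A standalone sketch of the same split, with -DCONFIG_CHECKPOINT_RESTORE standing in for the Kconfig selection (illustrative code, not from the patch):

    #include <stdio.h>

    #ifdef CONFIG_CHECKPOINT_RESTORE
    /* feature compiled in: do the real work */
    static int set_mm_field(unsigned long addr)
    {
            printf("would update mm field to %#lx\n", addr);
            return 0;
    }
    #else
    /* feature compiled out: mirrors the -EINVAL stub in prctl_set_mm() */
    static int set_mm_field(unsigned long addr)
    {
            (void)addr;
            return -1;
    }
    #endif

    int main(void)
    {
            return set_mm_field(0x400000) ? 1 : 0;
    }
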
diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -887,7 +887,7 @@ static void check_stack_usage(void) | |||
887 | static inline void check_stack_usage(void) {} | 887 | static inline void check_stack_usage(void) {} |
888 | #endif | 888 | #endif |
889 | 889 | ||
890 | NORET_TYPE void do_exit(long code) | 890 | void do_exit(long code) |
891 | { | 891 | { |
892 | struct task_struct *tsk = current; | 892 | struct task_struct *tsk = current; |
893 | int group_dead; | 893 | int group_dead; |
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) | |||
1051 | 1051 | ||
1052 | EXPORT_SYMBOL_GPL(do_exit); | 1052 | EXPORT_SYMBOL_GPL(do_exit); |
1053 | 1053 | ||
1054 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) | 1054 | void complete_and_exit(struct completion *comp, long code) |
1055 | { | 1055 | { |
1056 | if (comp) | 1056 | if (comp) |
1057 | complete(comp); | 1057 | complete(comp); |
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) | |||
1070 | * Take down every thread in the group. This is called by fatal signals | 1070 | * Take down every thread in the group. This is called by fatal signals |
1071 | * as well as by sys_exit_group (below). | 1071 | * as well as by sys_exit_group (below). |
1072 | */ | 1072 | */ |
1073 | NORET_TYPE void | 1073 | void |
1074 | do_group_exit(int exit_code) | 1074 | do_group_exit(int exit_code) |
1075 | { | 1075 | { |
1076 | struct signal_struct *sig = current->signal; | 1076 | struct signal_struct *sig = current->signal; |
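
Dropping NORET_TYPE leaves the does-not-return information to the compiler attribute carried on the declarations (in headers not shown in this excerpt). A minimal sketch of that annotation style, using an illustrative function rather than kernel code:

    #include <stdio.h>
    #include <stdlib.h>

    /* the attribute lives on the declaration; no macro in front of the definition */
    static void die(const char *msg) __attribute__((noreturn));

    static void die(const char *msg)
    {
            fprintf(stderr, "fatal: %s\n", msg);
            exit(1);        /* never returns, so the attribute is honest */
    }

    int main(void)
    {
            die("example");
    }
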
diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d9604..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/console.h> | 32 | #include <linux/console.h> |
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/kmsg_dump.h> | ||
36 | #include <linux/syscore_ops.h> | 35 | #include <linux/syscore_ops.h> |
37 | 36 | ||
38 | #include <asm/page.h> | 37 | #include <asm/page.h> |
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) | |||
1094 | if (kexec_crash_image) { | 1093 | if (kexec_crash_image) { |
1095 | struct pt_regs fixed_regs; | 1094 | struct pt_regs fixed_regs; |
1096 | 1095 | ||
1097 | kmsg_dump(KMSG_DUMP_KEXEC); | ||
1098 | |||
1099 | crash_setup_regs(&fixed_regs, regs); | 1096 | crash_setup_regs(&fixed_regs, regs); |
1100 | crash_save_vmcoreinfo(); | 1097 | crash_save_vmcoreinfo(); |
1101 | machine_crash_shutdown(&fixed_regs); | 1098 | machine_crash_shutdown(&fixed_regs); |
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) | |||
1132 | { | 1129 | { |
1133 | int ret = 0; | 1130 | int ret = 0; |
1134 | unsigned long start, end; | 1131 | unsigned long start, end; |
1132 | unsigned long old_size; | ||
1133 | struct resource *ram_res; | ||
1135 | 1134 | ||
1136 | mutex_lock(&kexec_mutex); | 1135 | mutex_lock(&kexec_mutex); |
1137 | 1136 | ||
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size) | |||
1141 | } | 1140 | } |
1142 | start = crashk_res.start; | 1141 | start = crashk_res.start; |
1143 | end = crashk_res.end; | 1142 | end = crashk_res.end; |
1143 | old_size = (end == 0) ? 0 : end - start + 1; | ||
1144 | if (new_size >= old_size) { | ||
1145 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
1146 | goto unlock; | ||
1147 | } | ||
1144 | 1148 | ||
1145 | if (new_size >= end - start + 1) { | 1149 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); |
1146 | ret = -EINVAL; | 1150 | if (!ram_res) { |
1147 | if (new_size == end - start + 1) | 1151 | ret = -ENOMEM; |
1148 | ret = 0; | ||
1149 | goto unlock; | 1152 | goto unlock; |
1150 | } | 1153 | } |
1151 | 1154 | ||
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size) | |||
1157 | 1160 | ||
1158 | if ((start == end) && (crashk_res.parent != NULL)) | 1161 | if ((start == end) && (crashk_res.parent != NULL)) |
1159 | release_resource(&crashk_res); | 1162 | release_resource(&crashk_res); |
1163 | |||
1164 | ram_res->start = end; | ||
1165 | ram_res->end = crashk_res.end; | ||
1166 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
1167 | ram_res->name = "System RAM"; | ||
1168 | |||
1160 | crashk_res.end = end - 1; | 1169 | crashk_res.end = end - 1; |
1170 | |||
1171 | insert_resource(&iomem_resource, ram_res); | ||
1161 | crash_unmap_reserved_pages(); | 1172 | crash_unmap_reserved_pages(); |
1162 | 1173 | ||
1163 | unlock: | 1174 | unlock: |
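
crash_shrink_memory() now bails out early when the requested size is not smaller than the current reservation and hands the freed tail back to the system as a "System RAM" resource. The shrink path is normally driven from user space; the sketch below assumes the usual /sys/kernel/kexec_crash_size attribute, which is not part of this hunk.

    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            const char *path = "/sys/kernel/kexec_crash_size";
            unsigned long long size;
            FILE *f = fopen(path, "r");

            if (!f || fscanf(f, "%llu", &size) != 1) {
                    perror(path);
                    return 1;
            }
            fclose(f);
            printf("crash kernel reservation: %llu bytes\n", size);

            if (argc > 1) {         /* shrink: new size must be <= current size */
                    f = fopen(path, "w");
                    if (!f || fprintf(f, "%s\n", argv[1]) < 0 || fclose(f) == EOF) {
                            perror(path);
                            return 1;
                    }
            }
            return 0;
    }
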
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
2198 | const char __user *user_buf, size_t count, loff_t *ppos) | 2198 | const char __user *user_buf, size_t count, loff_t *ppos) |
2199 | { | 2199 | { |
2200 | char buf[32]; | 2200 | char buf[32]; |
2201 | int buf_size; | 2201 | size_t buf_size; |
2202 | 2202 | ||
2203 | buf_size = min(count, (sizeof(buf)-1)); | 2203 | buf_size = min(count, (sizeof(buf)-1)); |
2204 | if (copy_from_user(buf, user_buf, buf_size)) | 2204 | if (copy_from_user(buf, user_buf, buf_size)) |
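
Keeping buf_size as size_t matches the types of count and sizeof(), so the clamp and the later copy stay in one unsigned type. A standalone illustration of the clamping pattern (plain MIN macro here, unlike the kernel's type-checked min()):

    #include <stdio.h>
    #include <string.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* copy at most bufsz - 1 bytes and NUL-terminate, keeping the count in size_t */
    static size_t bounded_copy(char *buf, size_t bufsz, const char *src, size_t count)
    {
            size_t n = MIN(count, bufsz - 1);

            memcpy(buf, src, n);
            buf[n] = '\0';
            return n;
    }

    int main(void)
    {
            char buf[32];
            size_t n = bounded_copy(buf, sizeof(buf), "1\n", 2);

            printf("copied %zu byte(s): %s", n, buf);
            return 0;
    }
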
diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c3..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -49,6 +49,15 @@ static long no_blink(int state) | |||
49 | long (*panic_blink)(int state); | 49 | long (*panic_blink)(int state); |
50 | EXPORT_SYMBOL(panic_blink); | 50 | EXPORT_SYMBOL(panic_blink); |
51 | 51 | ||
52 | /* | ||
53 | * Stop ourself in panic -- architecture code may override this | ||
54 | */ | ||
55 | void __weak panic_smp_self_stop(void) | ||
56 | { | ||
57 | while (1) | ||
58 | cpu_relax(); | ||
59 | } | ||
60 | |||
52 | /** | 61 | /** |
53 | * panic - halt the system | 62 | * panic - halt the system |
54 | * @fmt: The text string to print | 63 | * @fmt: The text string to print |
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); | |||
57 | * | 66 | * |
58 | * This function never returns. | 67 | * This function never returns. |
59 | */ | 68 | */ |
60 | NORET_TYPE void panic(const char * fmt, ...) | 69 | void panic(const char *fmt, ...) |
61 | { | 70 | { |
71 | static DEFINE_SPINLOCK(panic_lock); | ||
62 | static char buf[1024]; | 72 | static char buf[1024]; |
63 | va_list args; | 73 | va_list args; |
64 | long i, i_next = 0; | 74 | long i, i_next = 0; |
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
68 | * It's possible to come here directly from a panic-assertion and | 78 | * It's possible to come here directly from a panic-assertion and |
69 | * not have preempt disabled. Some functions called from here want | 79 | * not have preempt disabled. Some functions called from here want |
70 | * preempt to be disabled. No point enabling it later though... | 80 | * preempt to be disabled. No point enabling it later though... |
81 | * | ||
82 | * Only one CPU is allowed to execute the panic code from here. For | ||
83 | * multiple parallel invocations of panic, all other CPUs either | ||
84 | * stop themself or will wait until they are stopped by the 1st CPU | ||
85 | * with smp_send_stop(). | ||
71 | */ | 86 | */ |
72 | preempt_disable(); | 87 | if (!spin_trylock(&panic_lock)) |
88 | panic_smp_self_stop(); | ||
73 | 89 | ||
74 | console_verbose(); | 90 | console_verbose(); |
75 | bust_spinlocks(1); | 91 | bust_spinlocks(1); |
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
78 | va_end(args); | 94 | va_end(args); |
79 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | 95 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); |
80 | #ifdef CONFIG_DEBUG_BUGVERBOSE | 96 | #ifdef CONFIG_DEBUG_BUGVERBOSE |
81 | dump_stack(); | 97 | /* |
98 | * Avoid nested stack-dumping if a panic occurs during oops processing | ||
99 | */ | ||
100 | if (!oops_in_progress) | ||
101 | dump_stack(); | ||
82 | #endif | 102 | #endif |
83 | 103 | ||
84 | /* | 104 | /* |
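
panic_smp_self_stop() is deliberately __weak so an architecture can substitute its own stop loop. A standalone illustration of how weak and strong definitions resolve at link time; the function name and bodies are illustrative only:

    #include <stdio.h>

    /* generic default, marked weak so another object file may replace it */
    __attribute__((weak)) void panic_self_stop(void)
    {
            printf("generic stop loop (weak default)\n");
    }

    /*
     * An override in a second file would simply be a strong definition:
     *
     *     void panic_self_stop(void) { printf("arch-specific stop\n"); }
     */

    int main(void)
    {
            panic_self_stop();      /* uses the weak default when built alone */
            return 0;
    }
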
diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) | |||
137 | } | 137 | } |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | 140 | * We might be racing with someone else trying to set pid_ns->last_pid |
141 | * at the pid allocation time (there's also a sysctl for this, but racing | ||
142 | * with this one is OK, see comment in kernel/pid_namespace.c about it). | ||
141 | * We want the winner to have the "later" value, because if the | 143 | * We want the winner to have the "later" value, because if the |
142 | * "earlier" value prevails, then a pid may get reused immediately. | 144 | * "earlier" value prevails, then a pid may get reused immediately. |
143 | * | 145 | * |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 191 | return; |
192 | } | 192 | } |
193 | 193 | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | ||
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
196 | { | ||
197 | struct ctl_table tmp = *table; | ||
198 | |||
199 | if (write && !capable(CAP_SYS_ADMIN)) | ||
200 | return -EPERM; | ||
201 | |||
202 | /* | ||
203 | * Writing directly to ns' last_pid field is OK, since this field | ||
204 | * is volatile in a living namespace anyway and a code writing to | ||
205 | * it should synchronize its usage with external means. | ||
206 | */ | ||
207 | |||
208 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; | ||
209 | return proc_dointvec(&tmp, write, buffer, lenp, ppos); | ||
210 | } | ||
211 | |||
212 | static struct ctl_table pid_ns_ctl_table[] = { | ||
213 | { | ||
214 | .procname = "ns_last_pid", | ||
215 | .maxlen = sizeof(int), | ||
216 | .mode = 0666, /* permissions are checked in the handler */ | ||
217 | .proc_handler = pid_ns_ctl_handler, | ||
218 | }, | ||
219 | { } | ||
220 | }; | ||
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | ||
223 | |||
194 | static __init int pid_namespaces_init(void) | 224 | static __init int pid_namespaces_init(void) |
195 | { | 225 | { |
196 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 226 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
227 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | ||
197 | return 0; | 228 | return 0; |
198 | } | 229 | } |
199 | 230 | ||
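
With the sysctl registered, a checkpoint/restore tool can steer the next pid handed out in its namespace by writing the desired value minus one to kernel.ns_last_pid and forking immediately. A hedged userspace sketch; it assumes no concurrent fork in the namespace and CAP_SYS_ADMIN for the write, per the handler above.

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            const int want = 12345;
            FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");
            pid_t pid;

            if (!f) {
                    perror("ns_last_pid");
                    return 1;
            }
            /* the write fails with EPERM without CAP_SYS_ADMIN */
            if (fprintf(f, "%d", want - 1) < 0 || fclose(f) == EOF) {
                    perror("write ns_last_pid");
                    return 1;
            }

            pid = fork();
            if (pid == 0) {
                    printf("child pid: %d (wanted %d)\n", getpid(), want);
                    _exit(0);
            }
            waitpid(pid, NULL, 0);
            return 0;
    }
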
diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f8..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1692 | return mask; | 1692 | return mask; |
1693 | } | 1693 | } |
1694 | 1694 | ||
1695 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1696 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1697 | unsigned long arg4, unsigned long arg5) | ||
1698 | { | ||
1699 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
1700 | unsigned long vm_req_flags; | ||
1701 | unsigned long vm_bad_flags; | ||
1702 | struct vm_area_struct *vma; | ||
1703 | int error = 0; | ||
1704 | struct mm_struct *mm = current->mm; | ||
1705 | |||
1706 | if (arg4 | arg5) | ||
1707 | return -EINVAL; | ||
1708 | |||
1709 | if (!capable(CAP_SYS_ADMIN)) | ||
1710 | return -EPERM; | ||
1711 | |||
1712 | if (addr >= TASK_SIZE) | ||
1713 | return -EINVAL; | ||
1714 | |||
1715 | down_read(&mm->mmap_sem); | ||
1716 | vma = find_vma(mm, addr); | ||
1717 | |||
1718 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1719 | /* It must be existing VMA */ | ||
1720 | if (!vma || vma->vm_start > addr) | ||
1721 | goto out; | ||
1722 | } | ||
1723 | |||
1724 | error = -EINVAL; | ||
1725 | switch (opt) { | ||
1726 | case PR_SET_MM_START_CODE: | ||
1727 | case PR_SET_MM_END_CODE: | ||
1728 | vm_req_flags = VM_READ | VM_EXEC; | ||
1729 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1730 | |||
1731 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1732 | (vma->vm_flags & vm_bad_flags)) | ||
1733 | goto out; | ||
1734 | |||
1735 | if (opt == PR_SET_MM_START_CODE) | ||
1736 | mm->start_code = addr; | ||
1737 | else | ||
1738 | mm->end_code = addr; | ||
1739 | break; | ||
1740 | |||
1741 | case PR_SET_MM_START_DATA: | ||
1742 | case PR_SET_MM_END_DATA: | ||
1743 | vm_req_flags = VM_READ | VM_WRITE; | ||
1744 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1745 | |||
1746 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1747 | (vma->vm_flags & vm_bad_flags)) | ||
1748 | goto out; | ||
1749 | |||
1750 | if (opt == PR_SET_MM_START_DATA) | ||
1751 | mm->start_data = addr; | ||
1752 | else | ||
1753 | mm->end_data = addr; | ||
1754 | break; | ||
1755 | |||
1756 | case PR_SET_MM_START_STACK: | ||
1757 | |||
1758 | #ifdef CONFIG_STACK_GROWSUP | ||
1759 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1760 | #else | ||
1761 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1762 | #endif | ||
1763 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1764 | goto out; | ||
1765 | |||
1766 | mm->start_stack = addr; | ||
1767 | break; | ||
1768 | |||
1769 | case PR_SET_MM_START_BRK: | ||
1770 | if (addr <= mm->end_data) | ||
1771 | goto out; | ||
1772 | |||
1773 | if (rlim < RLIM_INFINITY && | ||
1774 | (mm->brk - addr) + | ||
1775 | (mm->end_data - mm->start_data) > rlim) | ||
1776 | goto out; | ||
1777 | |||
1778 | mm->start_brk = addr; | ||
1779 | break; | ||
1780 | |||
1781 | case PR_SET_MM_BRK: | ||
1782 | if (addr <= mm->end_data) | ||
1783 | goto out; | ||
1784 | |||
1785 | if (rlim < RLIM_INFINITY && | ||
1786 | (addr - mm->start_brk) + | ||
1787 | (mm->end_data - mm->start_data) > rlim) | ||
1788 | goto out; | ||
1789 | |||
1790 | mm->brk = addr; | ||
1791 | break; | ||
1792 | |||
1793 | default: | ||
1794 | error = -EINVAL; | ||
1795 | goto out; | ||
1796 | } | ||
1797 | |||
1798 | error = 0; | ||
1799 | |||
1800 | out: | ||
1801 | up_read(&mm->mmap_sem); | ||
1802 | |||
1803 | return error; | ||
1804 | } | ||
1805 | #else /* CONFIG_CHECKPOINT_RESTORE */ | ||
1806 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1807 | unsigned long arg4, unsigned long arg5) | ||
1808 | { | ||
1809 | return -EINVAL; | ||
1810 | } | ||
1811 | #endif | ||
1812 | |||
1695 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1813 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
1696 | unsigned long, arg4, unsigned long, arg5) | 1814 | unsigned long, arg4, unsigned long, arg5) |
1697 | { | 1815 | { |
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1841 | else | 1959 | else |
1842 | error = PR_MCE_KILL_DEFAULT; | 1960 | error = PR_MCE_KILL_DEFAULT; |
1843 | break; | 1961 | break; |
1962 | case PR_SET_MM: | ||
1963 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
1964 | break; | ||
1844 | default: | 1965 | default: |
1845 | error = -EINVAL; | 1966 | error = -EINVAL; |
1846 | break; | 1967 | break; |
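
From user space the new option is reached through prctl(2). Below is a hedged sketch that moves the recorded brk start to a freshly mapped area; PR_SET_MM and PR_SET_MM_START_BRK come from the prctl.h part of this series (not shown in this excerpt), so the numeric fallbacks are assumptions for illustration, and EPERM/EINVAL are the expected outcomes without CAP_SYS_ADMIN and CONFIG_CHECKPOINT_RESTORE=y.

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_MM
    #define PR_SET_MM              35      /* assumed value, see prctl.h in this series */
    #define PR_SET_MM_START_BRK    6       /* assumed value */
    #endif

    int main(void)
    {
            /* a plausible new brk area: an anonymous, writable mapping */
            void *area = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (area == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            /* arg4 and arg5 must be zero, matching the check in prctl_set_mm() */
            if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, (unsigned long)area, 0, 0))
                    perror("prctl(PR_SET_MM_START_BRK)");

            return 0;
    }
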
diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index 5a7a2adf4c4c..4531294fa62f 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c | |||
@@ -279,7 +279,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, | |||
279 | ret = 0; | 279 | ret = 0; |
280 | exit_2: | 280 | exit_2: |
281 | if (!input) | 281 | if (!input) |
282 | free(in_buf); | 282 | free(in_buf_save); |
283 | exit_1: | 283 | exit_1: |
284 | if (!output) | 284 | if (!output) |
285 | free(out_buf); | 285 | free(out_buf); |
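
The fix frees the pointer saved before the parse loop advanced the working cursor. A standalone illustration of that pattern outside the decompressor:

    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            char *in_buf = malloc(64);
            char *in_buf_save = in_buf;     /* keep the address malloc returned */

            if (!in_buf)
                    return 1;
            memset(in_buf, 0, 64);
            in_buf += 16;                   /* cursor moves past a parsed header */

            free(in_buf_save);              /* freeing the advanced 'in_buf' would corrupt the heap */
            return 0;
    }
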
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d9df7454519c..dc63d0818394 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -48,16 +48,14 @@ | |||
48 | struct radix_tree_node { | 48 | struct radix_tree_node { |
49 | unsigned int height; /* Height from the bottom */ | 49 | unsigned int height; /* Height from the bottom */ |
50 | unsigned int count; | 50 | unsigned int count; |
51 | struct rcu_head rcu_head; | 51 | union { |
52 | struct radix_tree_node *parent; /* Used when ascending tree */ | ||
53 | struct rcu_head rcu_head; /* Used when freeing node */ | ||
54 | }; | ||
52 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; | 55 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; |
53 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; | 56 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; |
54 | }; | 57 | }; |
55 | 58 | ||
56 | struct radix_tree_path { | ||
57 | struct radix_tree_node *node; | ||
58 | int offset; | ||
59 | }; | ||
60 | |||
61 | #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) | 59 | #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) |
62 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ | 60 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ |
63 | RADIX_TREE_MAP_SHIFT)) | 61 | RADIX_TREE_MAP_SHIFT)) |
@@ -256,6 +254,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) | |||
256 | static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | 254 | static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) |
257 | { | 255 | { |
258 | struct radix_tree_node *node; | 256 | struct radix_tree_node *node; |
257 | struct radix_tree_node *slot; | ||
259 | unsigned int height; | 258 | unsigned int height; |
260 | int tag; | 259 | int tag; |
261 | 260 | ||
@@ -274,18 +273,23 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | |||
274 | if (!(node = radix_tree_node_alloc(root))) | 273 | if (!(node = radix_tree_node_alloc(root))) |
275 | return -ENOMEM; | 274 | return -ENOMEM; |
276 | 275 | ||
277 | /* Increase the height. */ | ||
278 | node->slots[0] = indirect_to_ptr(root->rnode); | ||
279 | |||
280 | /* Propagate the aggregated tag info into the new root */ | 276 | /* Propagate the aggregated tag info into the new root */ |
281 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { | 277 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { |
282 | if (root_tag_get(root, tag)) | 278 | if (root_tag_get(root, tag)) |
283 | tag_set(node, tag, 0); | 279 | tag_set(node, tag, 0); |
284 | } | 280 | } |
285 | 281 | ||
282 | /* Increase the height. */ | ||
286 | newheight = root->height+1; | 283 | newheight = root->height+1; |
287 | node->height = newheight; | 284 | node->height = newheight; |
288 | node->count = 1; | 285 | node->count = 1; |
286 | node->parent = NULL; | ||
287 | slot = root->rnode; | ||
288 | if (newheight > 1) { | ||
289 | slot = indirect_to_ptr(slot); | ||
290 | slot->parent = node; | ||
291 | } | ||
292 | node->slots[0] = slot; | ||
289 | node = ptr_to_indirect(node); | 293 | node = ptr_to_indirect(node); |
290 | rcu_assign_pointer(root->rnode, node); | 294 | rcu_assign_pointer(root->rnode, node); |
291 | root->height = newheight; | 295 | root->height = newheight; |
@@ -331,6 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
331 | if (!(slot = radix_tree_node_alloc(root))) | 335 | if (!(slot = radix_tree_node_alloc(root))) |
332 | return -ENOMEM; | 336 | return -ENOMEM; |
333 | slot->height = height; | 337 | slot->height = height; |
338 | slot->parent = node; | ||
334 | if (node) { | 339 | if (node) { |
335 | rcu_assign_pointer(node->slots[offset], slot); | 340 | rcu_assign_pointer(node->slots[offset], slot); |
336 | node->count++; | 341 | node->count++; |
@@ -504,47 +509,41 @@ EXPORT_SYMBOL(radix_tree_tag_set); | |||
504 | void *radix_tree_tag_clear(struct radix_tree_root *root, | 509 | void *radix_tree_tag_clear(struct radix_tree_root *root, |
505 | unsigned long index, unsigned int tag) | 510 | unsigned long index, unsigned int tag) |
506 | { | 511 | { |
507 | /* | 512 | struct radix_tree_node *node = NULL; |
508 | * The radix tree path needs to be one longer than the maximum path | ||
509 | * since the "list" is null terminated. | ||
510 | */ | ||
511 | struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; | ||
512 | struct radix_tree_node *slot = NULL; | 513 | struct radix_tree_node *slot = NULL; |
513 | unsigned int height, shift; | 514 | unsigned int height, shift; |
515 | int uninitialized_var(offset); | ||
514 | 516 | ||
515 | height = root->height; | 517 | height = root->height; |
516 | if (index > radix_tree_maxindex(height)) | 518 | if (index > radix_tree_maxindex(height)) |
517 | goto out; | 519 | goto out; |
518 | 520 | ||
519 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 521 | shift = height * RADIX_TREE_MAP_SHIFT; |
520 | pathp->node = NULL; | ||
521 | slot = indirect_to_ptr(root->rnode); | 522 | slot = indirect_to_ptr(root->rnode); |
522 | 523 | ||
523 | while (height > 0) { | 524 | while (shift) { |
524 | int offset; | ||
525 | |||
526 | if (slot == NULL) | 525 | if (slot == NULL) |
527 | goto out; | 526 | goto out; |
528 | 527 | ||
528 | shift -= RADIX_TREE_MAP_SHIFT; | ||
529 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; | 529 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; |
530 | pathp[1].offset = offset; | 530 | node = slot; |
531 | pathp[1].node = slot; | ||
532 | slot = slot->slots[offset]; | 531 | slot = slot->slots[offset]; |
533 | pathp++; | ||
534 | shift -= RADIX_TREE_MAP_SHIFT; | ||
535 | height--; | ||
536 | } | 532 | } |
537 | 533 | ||
538 | if (slot == NULL) | 534 | if (slot == NULL) |
539 | goto out; | 535 | goto out; |
540 | 536 | ||
541 | while (pathp->node) { | 537 | while (node) { |
542 | if (!tag_get(pathp->node, tag, pathp->offset)) | 538 | if (!tag_get(node, tag, offset)) |
543 | goto out; | 539 | goto out; |
544 | tag_clear(pathp->node, tag, pathp->offset); | 540 | tag_clear(node, tag, offset); |
545 | if (any_tag_set(pathp->node, tag)) | 541 | if (any_tag_set(node, tag)) |
546 | goto out; | 542 | goto out; |
547 | pathp--; | 543 | |
544 | index >>= RADIX_TREE_MAP_SHIFT; | ||
545 | offset = index & RADIX_TREE_MAP_MASK; | ||
546 | node = node->parent; | ||
548 | } | 547 | } |
549 | 548 | ||
550 | /* clear the root's tag bit */ | 549 | /* clear the root's tag bit */ |
@@ -646,8 +645,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
646 | unsigned int iftag, unsigned int settag) | 645 | unsigned int iftag, unsigned int settag) |
647 | { | 646 | { |
648 | unsigned int height = root->height; | 647 | unsigned int height = root->height; |
649 | struct radix_tree_path path[height]; | 648 | struct radix_tree_node *node = NULL; |
650 | struct radix_tree_path *pathp = path; | ||
651 | struct radix_tree_node *slot; | 649 | struct radix_tree_node *slot; |
652 | unsigned int shift; | 650 | unsigned int shift; |
653 | unsigned long tagged = 0; | 651 | unsigned long tagged = 0; |
@@ -671,14 +669,8 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
671 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 669 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
672 | slot = indirect_to_ptr(root->rnode); | 670 | slot = indirect_to_ptr(root->rnode); |
673 | 671 | ||
674 | /* | ||
675 | * we fill the path from (root->height - 2) to 0, leaving the index at | ||
676 | * (root->height - 1) as a terminator. Zero the node in the terminator | ||
677 | * so that we can use this to end walk loops back up the path. | ||
678 | */ | ||
679 | path[height - 1].node = NULL; | ||
680 | |||
681 | for (;;) { | 672 | for (;;) { |
673 | unsigned long upindex; | ||
682 | int offset; | 674 | int offset; |
683 | 675 | ||
684 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; | 676 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; |
@@ -686,12 +678,10 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
686 | goto next; | 678 | goto next; |
687 | if (!tag_get(slot, iftag, offset)) | 679 | if (!tag_get(slot, iftag, offset)) |
688 | goto next; | 680 | goto next; |
689 | if (height > 1) { | 681 | if (shift) { |
690 | /* Go down one level */ | 682 | /* Go down one level */ |
691 | height--; | ||
692 | shift -= RADIX_TREE_MAP_SHIFT; | 683 | shift -= RADIX_TREE_MAP_SHIFT; |
693 | path[height - 1].node = slot; | 684 | node = slot; |
694 | path[height - 1].offset = offset; | ||
695 | slot = slot->slots[offset]; | 685 | slot = slot->slots[offset]; |
696 | continue; | 686 | continue; |
697 | } | 687 | } |
@@ -701,15 +691,27 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
701 | tag_set(slot, settag, offset); | 691 | tag_set(slot, settag, offset); |
702 | 692 | ||
703 | /* walk back up the path tagging interior nodes */ | 693 | /* walk back up the path tagging interior nodes */ |
704 | pathp = &path[0]; | 694 | upindex = index; |
705 | while (pathp->node) { | 695 | while (node) { |
696 | upindex >>= RADIX_TREE_MAP_SHIFT; | ||
697 | offset = upindex & RADIX_TREE_MAP_MASK; | ||
698 | |||
706 | /* stop if we find a node with the tag already set */ | 699 | /* stop if we find a node with the tag already set */ |
707 | if (tag_get(pathp->node, settag, pathp->offset)) | 700 | if (tag_get(node, settag, offset)) |
708 | break; | 701 | break; |
709 | tag_set(pathp->node, settag, pathp->offset); | 702 | tag_set(node, settag, offset); |
710 | pathp++; | 703 | node = node->parent; |
711 | } | 704 | } |
712 | 705 | ||
706 | /* | ||
707 | * Small optimization: now clear that node pointer. | ||
708 | * Since all of this slot's ancestors now have the tag set | ||
709 | * from setting it above, we have no further need to walk | ||
710 | * back up the tree setting tags, until we update slot to | ||
711 | * point to another radix_tree_node. | ||
712 | */ | ||
713 | node = NULL; | ||
714 | |||
713 | next: | 715 | next: |
714 | /* Go to next item at level determined by 'shift' */ | 716 | /* Go to next item at level determined by 'shift' */ |
715 | index = ((index >> shift) + 1) << shift; | 717 | index = ((index >> shift) + 1) << shift; |
@@ -724,8 +726,7 @@ next: | |||
724 | * last_index is guaranteed to be in the tree, what | 726 | * last_index is guaranteed to be in the tree, what |
725 | * we do below cannot wander astray. | 727 | * we do below cannot wander astray. |
726 | */ | 728 | */ |
727 | slot = path[height - 1].node; | 729 | slot = slot->parent; |
728 | height++; | ||
729 | shift += RADIX_TREE_MAP_SHIFT; | 730 | shift += RADIX_TREE_MAP_SHIFT; |
730 | } | 731 | } |
731 | } | 732 | } |
@@ -1299,7 +1300,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1299 | /* try to shrink tree height */ | 1300 | /* try to shrink tree height */ |
1300 | while (root->height > 0) { | 1301 | while (root->height > 0) { |
1301 | struct radix_tree_node *to_free = root->rnode; | 1302 | struct radix_tree_node *to_free = root->rnode; |
1302 | void *newptr; | 1303 | struct radix_tree_node *slot; |
1303 | 1304 | ||
1304 | BUG_ON(!radix_tree_is_indirect_ptr(to_free)); | 1305 | BUG_ON(!radix_tree_is_indirect_ptr(to_free)); |
1305 | to_free = indirect_to_ptr(to_free); | 1306 | to_free = indirect_to_ptr(to_free); |
@@ -1320,10 +1321,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1320 | * (to_free->slots[0]), it will be safe to dereference the new | 1321 | * (to_free->slots[0]), it will be safe to dereference the new |
1321 | * one (root->rnode) as far as dependent read barriers go. | 1322 | * one (root->rnode) as far as dependent read barriers go. |
1322 | */ | 1323 | */ |
1323 | newptr = to_free->slots[0]; | 1324 | slot = to_free->slots[0]; |
1324 | if (root->height > 1) | 1325 | if (root->height > 1) { |
1325 | newptr = ptr_to_indirect(newptr); | 1326 | slot->parent = NULL; |
1326 | root->rnode = newptr; | 1327 | slot = ptr_to_indirect(slot); |
1328 | } | ||
1329 | root->rnode = slot; | ||
1327 | root->height--; | 1330 | root->height--; |
1328 | 1331 | ||
1329 | /* | 1332 | /* |
@@ -1363,16 +1366,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1363 | */ | 1366 | */ |
1364 | void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | 1367 | void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) |
1365 | { | 1368 | { |
1366 | /* | 1369 | struct radix_tree_node *node = NULL; |
1367 | * The radix tree path needs to be one longer than the maximum path | ||
1368 | * since the "list" is null terminated. | ||
1369 | */ | ||
1370 | struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; | ||
1371 | struct radix_tree_node *slot = NULL; | 1370 | struct radix_tree_node *slot = NULL; |
1372 | struct radix_tree_node *to_free; | 1371 | struct radix_tree_node *to_free; |
1373 | unsigned int height, shift; | 1372 | unsigned int height, shift; |
1374 | int tag; | 1373 | int tag; |
1375 | int offset; | 1374 | int uninitialized_var(offset); |
1376 | 1375 | ||
1377 | height = root->height; | 1376 | height = root->height; |
1378 | if (index > radix_tree_maxindex(height)) | 1377 | if (index > radix_tree_maxindex(height)) |
@@ -1385,39 +1384,35 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1385 | goto out; | 1384 | goto out; |
1386 | } | 1385 | } |
1387 | slot = indirect_to_ptr(slot); | 1386 | slot = indirect_to_ptr(slot); |
1388 | 1387 | shift = height * RADIX_TREE_MAP_SHIFT; | |
1389 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | ||
1390 | pathp->node = NULL; | ||
1391 | 1388 | ||
1392 | do { | 1389 | do { |
1393 | if (slot == NULL) | 1390 | if (slot == NULL) |
1394 | goto out; | 1391 | goto out; |
1395 | 1392 | ||
1396 | pathp++; | 1393 | shift -= RADIX_TREE_MAP_SHIFT; |
1397 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; | 1394 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; |
1398 | pathp->offset = offset; | 1395 | node = slot; |
1399 | pathp->node = slot; | ||
1400 | slot = slot->slots[offset]; | 1396 | slot = slot->slots[offset]; |
1401 | shift -= RADIX_TREE_MAP_SHIFT; | 1397 | } while (shift); |
1402 | height--; | ||
1403 | } while (height > 0); | ||
1404 | 1398 | ||
1405 | if (slot == NULL) | 1399 | if (slot == NULL) |
1406 | goto out; | 1400 | goto out; |
1407 | 1401 | ||
1408 | /* | 1402 | /* |
1409 | * Clear all tags associated with the just-deleted item | 1403 | * Clear all tags associated with the item to be deleted. |
1404 | * This way of doing it would be inefficient, but seldom is any set. | ||
1410 | */ | 1405 | */ |
1411 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { | 1406 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { |
1412 | if (tag_get(pathp->node, tag, pathp->offset)) | 1407 | if (tag_get(node, tag, offset)) |
1413 | radix_tree_tag_clear(root, index, tag); | 1408 | radix_tree_tag_clear(root, index, tag); |
1414 | } | 1409 | } |
1415 | 1410 | ||
1416 | to_free = NULL; | 1411 | to_free = NULL; |
1417 | /* Now free the nodes we do not need anymore */ | 1412 | /* Now free the nodes we do not need anymore */ |
1418 | while (pathp->node) { | 1413 | while (node) { |
1419 | pathp->node->slots[pathp->offset] = NULL; | 1414 | node->slots[offset] = NULL; |
1420 | pathp->node->count--; | 1415 | node->count--; |
1421 | /* | 1416 | /* |
1422 | * Queue the node for deferred freeing after the | 1417 | * Queue the node for deferred freeing after the |
1423 | * last reference to it disappears (set NULL, above). | 1418 | * last reference to it disappears (set NULL, above). |
@@ -1425,17 +1420,20 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1425 | if (to_free) | 1420 | if (to_free) |
1426 | radix_tree_node_free(to_free); | 1421 | radix_tree_node_free(to_free); |
1427 | 1422 | ||
1428 | if (pathp->node->count) { | 1423 | if (node->count) { |
1429 | if (pathp->node == indirect_to_ptr(root->rnode)) | 1424 | if (node == indirect_to_ptr(root->rnode)) |
1430 | radix_tree_shrink(root); | 1425 | radix_tree_shrink(root); |
1431 | goto out; | 1426 | goto out; |
1432 | } | 1427 | } |
1433 | 1428 | ||
1434 | /* Node with zero slots in use so free it */ | 1429 | /* Node with zero slots in use so free it */ |
1435 | to_free = pathp->node; | 1430 | to_free = node; |
1436 | pathp--; | ||
1437 | 1431 | ||
1432 | index >>= RADIX_TREE_MAP_SHIFT; | ||
1433 | offset = index & RADIX_TREE_MAP_MASK; | ||
1434 | node = node->parent; | ||
1438 | } | 1435 | } |
1436 | |||
1439 | root_tag_clear_all(root); | 1437 | root_tag_clear_all(root); |
1440 | root->height = 0; | 1438 | root->height = 0; |
1441 | root->rnode = NULL; | 1439 | root->rnode = NULL; |
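
With the parent pointer stored in each node, the tag-clear and delete paths no longer keep a radix_tree_path array on the stack; they walk back up through node->parent and recompute each level's slot offset from the index. A toy, self-contained model of that upward walk (fanout and shift values are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    #define MAP_SHIFT 2
    #define MAP_SIZE  (1UL << MAP_SHIFT)
    #define MAP_MASK  (MAP_SIZE - 1)

    struct node {
            struct node *parent;
            struct node *slots[MAP_SIZE];
    };

    static struct node *new_node(struct node *parent)
    {
            struct node *n = calloc(1, sizeof(*n));

            if (!n)
                    exit(1);
            n->parent = parent;
            return n;
    }

    int main(void)
    {
            unsigned long index = 13;       /* 0b1101: offsets 1 and 3, bottom up */
            struct node *root = new_node(NULL);
            struct node *level1 = new_node(root);
            struct node *node;

            root->slots[(index >> MAP_SHIFT) & MAP_MASK] = level1;

            /* ascend from the leaf's parent, recomputing each level's offset */
            for (node = level1; node; node = node->parent) {
                    unsigned long offset = index & MAP_MASK;

                    printf("node %p: offset %lu\n", (void *)node, offset);
                    index >>= MAP_SHIFT;
            }
            free(level1);
            free(root);
            return 0;
    }
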
diff --git a/mm/compaction.c b/mm/compaction.c index e6670c34eb49..71a58f67f481 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -350,7 +350,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
350 | } | 350 | } |
351 | 351 | ||
352 | if (!cc->sync) | 352 | if (!cc->sync) |
353 | mode |= ISOLATE_CLEAN; | 353 | mode |= ISOLATE_ASYNC_MIGRATE; |
354 | 354 | ||
355 | /* Try isolate the page */ | 355 | /* Try isolate the page */ |
356 | if (__isolate_lru_page(page, mode, 0) != 0) | 356 | if (__isolate_lru_page(page, mode, 0) != 0) |
@@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
557 | nr_migrate = cc->nr_migratepages; | 557 | nr_migrate = cc->nr_migratepages; |
558 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 558 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
559 | (unsigned long)cc, false, | 559 | (unsigned long)cc, false, |
560 | cc->sync); | 560 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); |
561 | update_nr_listpages(cc); | 561 | update_nr_listpages(cc); |
562 | nr_remaining = cc->nr_migratepages; | 562 | nr_remaining = cc->nr_migratepages; |
563 | 563 | ||
@@ -671,6 +671,7 @@ static int compact_node(int nid) | |||
671 | .nr_freepages = 0, | 671 | .nr_freepages = 0, |
672 | .nr_migratepages = 0, | 672 | .nr_migratepages = 0, |
673 | .order = -1, | 673 | .order = -1, |
674 | .sync = true, | ||
674 | }; | 675 | }; |
675 | 676 | ||
676 | zone = &pgdat->node_zones[zoneid]; | 677 | zone = &pgdat->node_zones[zoneid]; |
diff --git a/mm/filemap.c b/mm/filemap.c index c4ee2e918bea..97f49ed35bd2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); | |||
393 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | 393 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
394 | { | 394 | { |
395 | int error; | 395 | int error; |
396 | struct mem_cgroup *memcg = NULL; | ||
397 | 396 | ||
398 | VM_BUG_ON(!PageLocked(old)); | 397 | VM_BUG_ON(!PageLocked(old)); |
399 | VM_BUG_ON(!PageLocked(new)); | 398 | VM_BUG_ON(!PageLocked(new)); |
400 | VM_BUG_ON(new->mapping); | 399 | VM_BUG_ON(new->mapping); |
401 | 400 | ||
402 | /* | ||
403 | * This is not page migration, but prepare_migration and | ||
404 | * end_migration does enough work for charge replacement. | ||
405 | * | ||
406 | * In the longer term we probably want a specialized function | ||
407 | * for moving the charge from old to new in a more efficient | ||
408 | * manner. | ||
409 | */ | ||
410 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); | ||
411 | if (error) | ||
412 | return error; | ||
413 | |||
414 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 401 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
415 | if (!error) { | 402 | if (!error) { |
416 | struct address_space *mapping = old->mapping; | 403 | struct address_space *mapping = old->mapping; |
@@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
432 | if (PageSwapBacked(new)) | 419 | if (PageSwapBacked(new)) |
433 | __inc_zone_page_state(new, NR_SHMEM); | 420 | __inc_zone_page_state(new, NR_SHMEM); |
434 | spin_unlock_irq(&mapping->tree_lock); | 421 | spin_unlock_irq(&mapping->tree_lock); |
422 | /* mem_cgroup codes must not be called under tree_lock */ | ||
423 | mem_cgroup_replace_page_cache(old, new); | ||
435 | radix_tree_preload_end(); | 424 | radix_tree_preload_end(); |
436 | if (freepage) | 425 | if (freepage) |
437 | freepage(old); | 426 | freepage(old); |
438 | page_cache_release(old); | 427 | page_cache_release(old); |
439 | mem_cgroup_end_migration(memcg, old, new, true); | ||
440 | } else { | ||
441 | mem_cgroup_end_migration(memcg, old, new, false); | ||
442 | } | 428 | } |
443 | 429 | ||
444 | return error; | 430 | return error; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d988b4ef..b3ffc21ce801 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = { | |||
487 | .attrs = khugepaged_attr, | 487 | .attrs = khugepaged_attr, |
488 | .name = "khugepaged", | 488 | .name = "khugepaged", |
489 | }; | 489 | }; |
490 | #endif /* CONFIG_SYSFS */ | ||
491 | 490 | ||
492 | static int __init hugepage_init(void) | 491 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) |
493 | { | 492 | { |
494 | int err; | 493 | int err; |
495 | #ifdef CONFIG_SYSFS | ||
496 | static struct kobject *hugepage_kobj; | ||
497 | #endif | ||
498 | |||
499 | err = -EINVAL; | ||
500 | if (!has_transparent_hugepage()) { | ||
501 | transparent_hugepage_flags = 0; | ||
502 | goto out; | ||
503 | } | ||
504 | 494 | ||
505 | #ifdef CONFIG_SYSFS | 495 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
506 | err = -ENOMEM; | 496 | if (unlikely(!*hugepage_kobj)) { |
507 | hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | ||
508 | if (unlikely(!hugepage_kobj)) { | ||
509 | printk(KERN_ERR "hugepage: failed kobject create\n"); | 497 | printk(KERN_ERR "hugepage: failed kobject create\n"); |
510 | goto out; | 498 | return -ENOMEM; |
511 | } | 499 | } |
512 | 500 | ||
513 | err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); | 501 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
514 | if (err) { | 502 | if (err) { |
515 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | 503 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); |
516 | goto out; | 504 | goto delete_obj; |
517 | } | 505 | } |
518 | 506 | ||
519 | err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); | 507 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
520 | if (err) { | 508 | if (err) { |
521 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | 509 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); |
522 | goto out; | 510 | goto remove_hp_group; |
523 | } | 511 | } |
524 | #endif | 512 | |
513 | return 0; | ||
514 | |||
515 | remove_hp_group: | ||
516 | sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); | ||
517 | delete_obj: | ||
518 | kobject_put(*hugepage_kobj); | ||
519 | return err; | ||
520 | } | ||
521 | |||
522 | static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) | ||
523 | { | ||
524 | sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); | ||
525 | sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); | ||
526 | kobject_put(hugepage_kobj); | ||
527 | } | ||
528 | #else | ||
529 | static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) | ||
530 | { | ||
531 | return 0; | ||
532 | } | ||
533 | |||
534 | static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) | ||
535 | { | ||
536 | } | ||
537 | #endif /* CONFIG_SYSFS */ | ||
538 | |||
539 | static int __init hugepage_init(void) | ||
540 | { | ||
541 | int err; | ||
542 | struct kobject *hugepage_kobj; | ||
543 | |||
544 | if (!has_transparent_hugepage()) { | ||
545 | transparent_hugepage_flags = 0; | ||
546 | return -EINVAL; | ||
547 | } | ||
548 | |||
549 | err = hugepage_init_sysfs(&hugepage_kobj); | ||
550 | if (err) | ||
551 | return err; | ||
525 | 552 | ||
526 | err = khugepaged_slab_init(); | 553 | err = khugepaged_slab_init(); |
527 | if (err) | 554 | if (err) |
@@ -545,7 +572,9 @@ static int __init hugepage_init(void) | |||
545 | 572 | ||
546 | set_recommended_min_free_kbytes(); | 573 | set_recommended_min_free_kbytes(); |
547 | 574 | ||
575 | return 0; | ||
548 | out: | 576 | out: |
577 | hugepage_exit_sysfs(hugepage_kobj); | ||
549 | return err; | 578 | return err; |
550 | } | 579 | } |
551 | module_init(hugepage_init) | 580 | module_init(hugepage_init) |
@@ -997,7 +1026,7 @@ out: | |||
997 | } | 1026 | } |
998 | 1027 | ||
999 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1028 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1000 | pmd_t *pmd) | 1029 | pmd_t *pmd, unsigned long addr) |
1001 | { | 1030 | { |
1002 | int ret = 0; | 1031 | int ret = 0; |
1003 | 1032 | ||
@@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1013 | pgtable = get_pmd_huge_pte(tlb->mm); | 1042 | pgtable = get_pmd_huge_pte(tlb->mm); |
1014 | page = pmd_page(*pmd); | 1043 | page = pmd_page(*pmd); |
1015 | pmd_clear(pmd); | 1044 | pmd_clear(pmd); |
1045 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | ||
1016 | page_remove_rmap(page); | 1046 | page_remove_rmap(page); |
1017 | VM_BUG_ON(page_mapcount(page) < 0); | 1047 | VM_BUG_ON(page_mapcount(page) < 0); |
1018 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1048 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
@@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1116 | entry = pmd_modify(entry, newprot); | 1146 | entry = pmd_modify(entry, newprot); |
1117 | set_pmd_at(mm, addr, pmd, entry); | 1147 | set_pmd_at(mm, addr, pmd, entry); |
1118 | spin_unlock(&vma->vm_mm->page_table_lock); | 1148 | spin_unlock(&vma->vm_mm->page_table_lock); |
1119 | flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); | ||
1120 | ret = 1; | 1149 | ret = 1; |
1121 | } | 1150 | } |
1122 | } else | 1151 | } else |
@@ -1199,16 +1228,16 @@ static int __split_huge_page_splitting(struct page *page, | |||
1199 | static void __split_huge_page_refcount(struct page *page) | 1228 | static void __split_huge_page_refcount(struct page *page) |
1200 | { | 1229 | { |
1201 | int i; | 1230 | int i; |
1202 | unsigned long head_index = page->index; | ||
1203 | struct zone *zone = page_zone(page); | 1231 | struct zone *zone = page_zone(page); |
1204 | int zonestat; | ||
1205 | int tail_count = 0; | 1232 | int tail_count = 0; |
1206 | 1233 | ||
1207 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1234 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1208 | spin_lock_irq(&zone->lru_lock); | 1235 | spin_lock_irq(&zone->lru_lock); |
1209 | compound_lock(page); | 1236 | compound_lock(page); |
1237 | /* complete memcg works before add pages to LRU */ | ||
1238 | mem_cgroup_split_huge_fixup(page); | ||
1210 | 1239 | ||
1211 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1240 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { |
1212 | struct page *page_tail = page + i; | 1241 | struct page *page_tail = page + i; |
1213 | 1242 | ||
1214 | /* tail_page->_mapcount cannot change */ | 1243 | /* tail_page->_mapcount cannot change */ |
@@ -1271,14 +1300,13 @@ static void __split_huge_page_refcount(struct page *page) | |||
1271 | BUG_ON(page_tail->mapping); | 1300 | BUG_ON(page_tail->mapping); |
1272 | page_tail->mapping = page->mapping; | 1301 | page_tail->mapping = page->mapping; |
1273 | 1302 | ||
1274 | page_tail->index = ++head_index; | 1303 | page_tail->index = page->index + i; |
1275 | 1304 | ||
1276 | BUG_ON(!PageAnon(page_tail)); | 1305 | BUG_ON(!PageAnon(page_tail)); |
1277 | BUG_ON(!PageUptodate(page_tail)); | 1306 | BUG_ON(!PageUptodate(page_tail)); |
1278 | BUG_ON(!PageDirty(page_tail)); | 1307 | BUG_ON(!PageDirty(page_tail)); |
1279 | BUG_ON(!PageSwapBacked(page_tail)); | 1308 | BUG_ON(!PageSwapBacked(page_tail)); |
1280 | 1309 | ||
1281 | mem_cgroup_split_huge_fixup(page, page_tail); | ||
1282 | 1310 | ||
1283 | lru_add_page_tail(zone, page, page_tail); | 1311 | lru_add_page_tail(zone, page, page_tail); |
1284 | } | 1312 | } |
@@ -1288,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page) | |||
1288 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1316 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1289 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1317 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
1290 | 1318 | ||
1291 | /* | ||
1292 | * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, | ||
1293 | * so adjust those appropriately if this page is on the LRU. | ||
1294 | */ | ||
1295 | if (PageLRU(page)) { | ||
1296 | zonestat = NR_LRU_BASE + page_lru(page); | ||
1297 | __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); | ||
1298 | } | ||
1299 | |||
1300 | ClearPageCompound(page); | 1319 | ClearPageCompound(page); |
1301 | compound_unlock(page); | 1320 | compound_unlock(page); |
1302 | spin_unlock_irq(&zone->lru_lock); | 1321 | spin_unlock_irq(&zone->lru_lock); |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/kthread.h> | 28 | #include <linux/kthread.h> |
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/memcontrol.h> | ||
31 | #include <linux/rbtree.h> | 32 | #include <linux/rbtree.h> |
32 | #include <linux/memory.h> | 33 | #include <linux/memory.h> |
33 | #include <linux/mmu_notifier.h> | 34 | #include <linux/mmu_notifier.h> |
@@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1571 | 1572 | ||
1572 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1573 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1573 | if (new_page) { | 1574 | if (new_page) { |
1575 | /* | ||
1576 | * The memcg-specific accounting when moving | ||
1577 | * pages around the LRU lists relies on the | ||
1578 | * page's owner (memcg) to be valid. Usually, | ||
1579 | * pages are assigned to a new owner before | ||
1580 | * being put on the LRU list, but since this | ||
1581 | * is not the case here, the stale owner from | ||
1582 | * a previous allocation cycle must be reset. | ||
1583 | */ | ||
1584 | mem_cgroup_reset_owner(new_page); | ||
1574 | copy_user_highpage(new_page, page, address, vma); | 1585 | copy_user_highpage(new_page, page, address, vma); |
1575 | 1586 | ||
1576 | SetPageDirty(new_page); | 1587 | SetPageDirty(new_page); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d87aa3510c5e..602207be9853 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu { | |||
123 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 123 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | struct mem_cgroup_reclaim_iter { | ||
127 | /* css_id of the last scanned hierarchy member */ | ||
128 | int position; | ||
129 | /* scan generation, increased every round-trip */ | ||
130 | unsigned int generation; | ||
131 | }; | ||
132 | |||
126 | /* | 133 | /* |
127 | * per-zone information in memory controller. | 134 | * per-zone information in memory controller. |
128 | */ | 135 | */ |
129 | struct mem_cgroup_per_zone { | 136 | struct mem_cgroup_per_zone { |
130 | /* | 137 | struct lruvec lruvec; |
131 | * spin_lock to protect the per cgroup LRU | ||
132 | */ | ||
133 | struct list_head lists[NR_LRU_LISTS]; | ||
134 | unsigned long count[NR_LRU_LISTS]; | 138 | unsigned long count[NR_LRU_LISTS]; |
135 | 139 | ||
140 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | ||
141 | |||
136 | struct zone_reclaim_stat reclaim_stat; | 142 | struct zone_reclaim_stat reclaim_stat; |
137 | struct rb_node tree_node; /* RB tree node */ | 143 | struct rb_node tree_node; /* RB tree node */ |
138 | unsigned long long usage_in_excess;/* Set to the value by which */ | 144 | unsigned long long usage_in_excess;/* Set to the value by which */ |
@@ -233,11 +239,6 @@ struct mem_cgroup { | |||
233 | * per zone LRU lists. | 239 | * per zone LRU lists. |
234 | */ | 240 | */ |
235 | struct mem_cgroup_lru_info info; | 241 | struct mem_cgroup_lru_info info; |
236 | /* | ||
237 | * While reclaiming in a hierarchy, we cache the last child we | ||
238 | * reclaimed from. | ||
239 | */ | ||
240 | int last_scanned_child; | ||
241 | int last_scanned_node; | 242 | int last_scanned_node; |
242 | #if MAX_NUMNODES > 1 | 243 | #if MAX_NUMNODES > 1 |
243 | nodemask_t scan_nodes; | 244 | nodemask_t scan_nodes; |
@@ -366,8 +367,6 @@ enum charge_type { | |||
366 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | 367 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) |
367 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 368 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
368 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 369 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
369 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
370 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
371 | 370 | ||
372 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 371 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
373 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 372 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
@@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | |||
566 | struct mem_cgroup_per_zone *mz; | 565 | struct mem_cgroup_per_zone *mz; |
567 | struct mem_cgroup_tree_per_zone *mctz; | 566 | struct mem_cgroup_tree_per_zone *mctz; |
568 | 567 | ||
569 | for_each_node_state(node, N_POSSIBLE) { | 568 | for_each_node(node) { |
570 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 569 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
571 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | 570 | mz = mem_cgroup_zoneinfo(memcg, node, zone); |
572 | mctz = soft_limit_tree_node_zone(node, zone); | 571 | mctz = soft_limit_tree_node_zone(node, zone); |
@@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
656 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 655 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
657 | } | 656 | } |
658 | 657 | ||
659 | void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) | ||
660 | { | ||
661 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); | ||
662 | } | ||
663 | |||
664 | void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) | ||
665 | { | ||
666 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); | ||
667 | } | ||
668 | |||
669 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 658 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
670 | enum mem_cgroup_events_index idx) | 659 | enum mem_cgroup_events_index idx) |
671 | { | 660 | { |
@@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
749 | return total; | 738 | return total; |
750 | } | 739 | } |
751 | 740 | ||
752 | static bool __memcg_event_check(struct mem_cgroup *memcg, int target) | 741 | static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
742 | enum mem_cgroup_events_target target) | ||
753 | { | 743 | { |
754 | unsigned long val, next; | 744 | unsigned long val, next; |
755 | 745 | ||
756 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 746 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
757 | next = __this_cpu_read(memcg->stat->targets[target]); | 747 | next = __this_cpu_read(memcg->stat->targets[target]); |
758 | /* from time_after() in jiffies.h */ | 748 | /* from time_after() in jiffies.h */ |
759 | return ((long)next - (long)val < 0); | 749 | if ((long)next - (long)val < 0) { |
760 | } | 750 | switch (target) { |
761 | 751 | case MEM_CGROUP_TARGET_THRESH: | |
762 | static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) | 752 | next = val + THRESHOLDS_EVENTS_TARGET; |
763 | { | 753 | break; |
764 | unsigned long val, next; | 754 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
765 | 755 | next = val + SOFTLIMIT_EVENTS_TARGET; | |
766 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 756 | break; |
767 | 757 | case MEM_CGROUP_TARGET_NUMAINFO: | |
768 | switch (target) { | 758 | next = val + NUMAINFO_EVENTS_TARGET; |
769 | case MEM_CGROUP_TARGET_THRESH: | 759 | break; |
770 | next = val + THRESHOLDS_EVENTS_TARGET; | 760 | default: |
771 | break; | 761 | break; |
772 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 762 | } |
773 | next = val + SOFTLIMIT_EVENTS_TARGET; | 763 | __this_cpu_write(memcg->stat->targets[target], next); |
774 | break; | 764 | return true; |
775 | case MEM_CGROUP_TARGET_NUMAINFO: | ||
776 | next = val + NUMAINFO_EVENTS_TARGET; | ||
777 | break; | ||
778 | default: | ||
779 | return; | ||
780 | } | 765 | } |
781 | 766 | return false; | |
782 | __this_cpu_write(memcg->stat->targets[target], next); | ||
783 | } | 767 | } |
784 | 768 | ||
785 | /* | 769 | /* |
@@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
790 | { | 774 | { |
791 | preempt_disable(); | 775 | preempt_disable(); |
792 | /* threshold event is triggered in finer grain than soft limit */ | 776 | /* threshold event is triggered in finer grain than soft limit */ |
793 | if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { | 777 | if (unlikely(mem_cgroup_event_ratelimit(memcg, |
778 | MEM_CGROUP_TARGET_THRESH))) { | ||
779 | bool do_softlimit, do_numainfo; | ||
780 | |||
781 | do_softlimit = mem_cgroup_event_ratelimit(memcg, | ||
782 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
783 | #if MAX_NUMNODES > 1 | ||
784 | do_numainfo = mem_cgroup_event_ratelimit(memcg, | ||
785 | MEM_CGROUP_TARGET_NUMAINFO); | ||
786 | #endif | ||
787 | preempt_enable(); | ||
788 | |||
794 | mem_cgroup_threshold(memcg); | 789 | mem_cgroup_threshold(memcg); |
795 | __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); | 790 | if (unlikely(do_softlimit)) |
796 | if (unlikely(__memcg_event_check(memcg, | ||
797 | MEM_CGROUP_TARGET_SOFTLIMIT))) { | ||
798 | mem_cgroup_update_tree(memcg, page); | 791 | mem_cgroup_update_tree(memcg, page); |
799 | __mem_cgroup_target_update(memcg, | ||
800 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
801 | } | ||
802 | #if MAX_NUMNODES > 1 | 792 | #if MAX_NUMNODES > 1 |
803 | if (unlikely(__memcg_event_check(memcg, | 793 | if (unlikely(do_numainfo)) |
804 | MEM_CGROUP_TARGET_NUMAINFO))) { | ||
805 | atomic_inc(&memcg->numainfo_events); | 794 | atomic_inc(&memcg->numainfo_events); |
806 | __mem_cgroup_target_update(memcg, | ||
807 | MEM_CGROUP_TARGET_NUMAINFO); | ||
808 | } | ||
809 | #endif | 795 | #endif |
810 | } | 796 | } else |
811 | preempt_enable(); | 797 | preempt_enable(); |
812 | } | 798 | } |
813 | 799 | ||
814 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 800 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
@@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
853 | return memcg; | 839 | return memcg; |
854 | } | 840 | } |
855 | 841 | ||
856 | /* The caller has to guarantee "mem" exists before calling this */ | 842 | /** |
857 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) | 843 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
844 | * @root: hierarchy root | ||
845 | * @prev: previously returned memcg, NULL on first invocation | ||
846 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | ||
847 | * | ||
848 | * Returns references to children of the hierarchy below @root, or | ||
849 | * @root itself, or %NULL after a full round-trip. | ||
850 | * | ||
851 | * Caller must pass the return value in @prev on subsequent | ||
852 | * invocations for reference counting, or use mem_cgroup_iter_break() | ||
853 | * to cancel a hierarchy walk before the round-trip is complete. | ||
854 | * | ||
855 | * Reclaimers can specify a zone and a priority level in @reclaim to | ||
856 | * divide up the memcgs in the hierarchy among all concurrent | ||
857 | * reclaimers operating on the same zone and priority. | ||
858 | */ | ||
859 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | ||
860 | struct mem_cgroup *prev, | ||
861 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
858 | { | 862 | { |
859 | struct cgroup_subsys_state *css; | 863 | struct mem_cgroup *memcg = NULL; |
860 | int found; | 864 | int id = 0; |
861 | 865 | ||
862 | if (!memcg) /* ROOT cgroup has the smallest ID */ | 866 | if (mem_cgroup_disabled()) |
863 | return root_mem_cgroup; /*css_put/get against root is ignored*/ | ||
864 | if (!memcg->use_hierarchy) { | ||
865 | if (css_tryget(&memcg->css)) | ||
866 | return memcg; | ||
867 | return NULL; | 867 | return NULL; |
868 | } | ||
869 | rcu_read_lock(); | ||
870 | /* | ||
871 | * searching a memory cgroup which has the smallest ID under given | ||
872 | * ROOT cgroup. (ID >= 1) | ||
873 | */ | ||
874 | css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); | ||
875 | if (css && css_tryget(css)) | ||
876 | memcg = container_of(css, struct mem_cgroup, css); | ||
877 | else | ||
878 | memcg = NULL; | ||
879 | rcu_read_unlock(); | ||
880 | return memcg; | ||
881 | } | ||
882 | 868 | ||
883 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | 869 | if (!root) |
884 | struct mem_cgroup *root, | 870 | root = root_mem_cgroup; |
885 | bool cond) | ||
886 | { | ||
887 | int nextid = css_id(&iter->css) + 1; | ||
888 | int found; | ||
889 | int hierarchy_used; | ||
890 | struct cgroup_subsys_state *css; | ||
891 | 871 | ||
892 | hierarchy_used = iter->use_hierarchy; | 872 | if (prev && !reclaim) |
873 | id = css_id(&prev->css); | ||
893 | 874 | ||
894 | css_put(&iter->css); | 875 | if (prev && prev != root) |
895 | /* If no ROOT, walk all, ignore hierarchy */ | 876 | css_put(&prev->css); |
896 | if (!cond || (root && !hierarchy_used)) | ||
897 | return NULL; | ||
898 | 877 | ||
899 | if (!root) | 878 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
900 | root = root_mem_cgroup; | 879 | if (prev) |
880 | return NULL; | ||
881 | return root; | ||
882 | } | ||
901 | 883 | ||
902 | do { | 884 | while (!memcg) { |
903 | iter = NULL; | 885 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); |
904 | rcu_read_lock(); | 886 | struct cgroup_subsys_state *css; |
887 | |||
888 | if (reclaim) { | ||
889 | int nid = zone_to_nid(reclaim->zone); | ||
890 | int zid = zone_idx(reclaim->zone); | ||
891 | struct mem_cgroup_per_zone *mz; | ||
905 | 892 | ||
906 | css = css_get_next(&mem_cgroup_subsys, nextid, | 893 | mz = mem_cgroup_zoneinfo(root, nid, zid); |
907 | &root->css, &found); | 894 | iter = &mz->reclaim_iter[reclaim->priority]; |
908 | if (css && css_tryget(css)) | 895 | if (prev && reclaim->generation != iter->generation) |
909 | iter = container_of(css, struct mem_cgroup, css); | 896 | return NULL; |
897 | id = iter->position; | ||
898 | } | ||
899 | |||
900 | rcu_read_lock(); | ||
901 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | ||
902 | if (css) { | ||
903 | if (css == &root->css || css_tryget(css)) | ||
904 | memcg = container_of(css, | ||
905 | struct mem_cgroup, css); | ||
906 | } else | ||
907 | id = 0; | ||
910 | rcu_read_unlock(); | 908 | rcu_read_unlock(); |
911 | /* If css is NULL, no more cgroups will be found */ | ||
912 | nextid = found + 1; | ||
913 | } while (css && !iter); | ||
914 | 909 | ||
915 | return iter; | 910 | if (reclaim) { |
911 | iter->position = id; | ||
912 | if (!css) | ||
913 | iter->generation++; | ||
914 | else if (!prev && memcg) | ||
915 | reclaim->generation = iter->generation; | ||
916 | } | ||
917 | |||
918 | if (prev && !css) | ||
919 | return NULL; | ||
920 | } | ||
921 | return memcg; | ||
916 | } | 922 | } |
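mem_cgroup_iter() replaces the old start_loop/get_next pair: it hands out successive css IDs under @root, and when a reclaim cookie is supplied it loads and stores the shared per-zone, per-priority position and bumps a generation count when the walk wraps, so concurrent reclaimers divide one round-trip among themselves. The stand-alone sketch below models only that position/generation handshake; the structures, the fixed ID range and the -1 sentinel are all invented for the demo.

/* User-space model of the shared hierarchy walk: one iterator state per
 * (zone, priority) hands out successive members and bumps a generation
 * when a full round-trip completes.  All names here are illustrative. */
#include <stdio.h>

#define NR_MEMCGS 5

struct reclaim_iter   { int position; unsigned int generation; };
struct reclaim_cookie { unsigned int generation; };

static struct reclaim_iter shared_iter = { .position = -1 };

/* Returns the next member id, or -1 once the walk wrapped for this caller. */
static int memcg_iter(int prev, struct reclaim_cookie *cookie)
{
	struct reclaim_iter *iter = &shared_iter;
	int id;

	if (prev >= 0 && cookie->generation != iter->generation)
		return -1;		/* someone else finished this round already */

	id = iter->position + 1;
	if (id >= NR_MEMCGS) {		/* wrapped: the round-trip is complete */
		iter->position = -1;
		iter->generation++;
		return -1;
	}
	iter->position = id;
	if (prev < 0)			/* first call records the generation */
		cookie->generation = iter->generation;
	return id;
}

int main(void)
{
	struct reclaim_cookie a = { 0 }, b = { 0 };
	int ida = -1, idb = -1;
	int a_done = 0, b_done = 0;

	/* Two interleaved reclaimers split one round-trip between them
	 * instead of both starting at the first child. */
	while (!a_done || !b_done) {
		if (!a_done) {
			ida = memcg_iter(ida, &a);
			if (ida < 0)
				a_done = 1;
			else
				printf("reclaimer A visits memcg %d\n", ida);
		}
		if (!b_done) {
			idb = memcg_iter(idb, &b);
			if (idb < 0)
				b_done = 1;
			else
				printf("reclaimer B visits memcg %d\n", idb);
		}
	}
	return 0;
}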
917 | /* | ||
918 | * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please | ||
919 | * be careful that "break" loop is not allowed. We have reference count. | ||
920 | * Instead of that modify "cond" to be false and "continue" to exit the loop. | ||
921 | */ | ||
922 | #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ | ||
923 | for (iter = mem_cgroup_start_loop(root);\ | ||
924 | iter != NULL;\ | ||
925 | iter = mem_cgroup_get_next(iter, root, cond)) | ||
926 | 923 | ||
927 | #define for_each_mem_cgroup_tree(iter, root) \ | 924 | /** |
928 | for_each_mem_cgroup_tree_cond(iter, root, true) | 925 | * mem_cgroup_iter_break - abort a hierarchy walk prematurely |
926 | * @root: hierarchy root | ||
927 | * @prev: last visited hierarchy member as returned by mem_cgroup_iter() | ||
928 | */ | ||
929 | void mem_cgroup_iter_break(struct mem_cgroup *root, | ||
930 | struct mem_cgroup *prev) | ||
931 | { | ||
932 | if (!root) | ||
933 | root = root_mem_cgroup; | ||
934 | if (prev && prev != root) | ||
935 | css_put(&prev->css); | ||
936 | } | ||
929 | 937 | ||
930 | #define for_each_mem_cgroup_all(iter) \ | 938 | /* |
931 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | 939 | * Iteration constructs for visiting all cgroups (under a tree). If |
940 | * loops are exited prematurely (break), mem_cgroup_iter_break() must | ||
941 | * be used for reference counting. | ||
942 | */ | ||
943 | #define for_each_mem_cgroup_tree(iter, root) \ | ||
944 | for (iter = mem_cgroup_iter(root, NULL, NULL); \ | ||
945 | iter != NULL; \ | ||
946 | iter = mem_cgroup_iter(root, iter, NULL)) | ||
932 | 947 | ||
948 | #define for_each_mem_cgroup(iter) \ | ||
949 | for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ | ||
950 | iter != NULL; \ | ||
951 | iter = mem_cgroup_iter(NULL, iter, NULL)) | ||
933 | 952 | ||
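With the new iterator, breaking out of for_each_mem_cgroup_tree()/for_each_mem_cgroup() is legal, but the caller must pass the last-returned member to mem_cgroup_iter_break() so its reference is dropped; the old cond-variable workaround is gone. A toy model of that reference discipline, with refcounts as plain integers and a fake array standing in for the cgroup hierarchy:

/* Toy model of the walk/break reference discipline; the "memcg" array
 * and refcounting are stand-ins, not the kernel objects. */
#include <assert.h>
#include <stdio.h>

#define NR_MEMCGS 4

struct memcg { int id; int refcnt; };
static struct memcg tree[NR_MEMCGS] = { {0, 1}, {1, 1}, {2, 1}, {3, 1} };

static struct memcg *memcg_iter(struct memcg *prev)
{
	int next = prev ? prev->id + 1 : 0;

	if (prev)
		prev->refcnt--;			/* css_put(prev) */
	if (next >= NR_MEMCGS)
		return NULL;
	tree[next].refcnt++;			/* css_tryget(next) */
	return &tree[next];
}

static void memcg_iter_break(struct memcg *prev)
{
	if (prev)
		prev->refcnt--;			/* drop the reference we hold */
}

#define for_each_memcg(iter) \
	for (iter = memcg_iter(NULL); iter; iter = memcg_iter(iter))

int main(void)
{
	struct memcg *iter;

	for_each_memcg(iter) {
		if (iter->id == 2) {
			memcg_iter_break(iter);	/* required before "break" */
			break;
		}
		printf("visiting memcg %d\n", iter->id);
	}
	for (int i = 0; i < NR_MEMCGS; i++)
		assert(tree[i].refcnt == 1);	/* no leaked references */
	return 0;
}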
934 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 953 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
935 | { | 954 | { |
@@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
949 | goto out; | 968 | goto out; |
950 | 969 | ||
951 | switch (idx) { | 970 | switch (idx) { |
952 | case PGMAJFAULT: | ||
953 | mem_cgroup_pgmajfault(memcg, 1); | ||
954 | break; | ||
955 | case PGFAULT: | 971 | case PGFAULT: |
956 | mem_cgroup_pgfault(memcg, 1); | 972 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); |
973 | break; | ||
974 | case PGMAJFAULT: | ||
975 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); | ||
957 | break; | 976 | break; |
958 | default: | 977 | default: |
959 | BUG(); | 978 | BUG(); |
@@ -963,6 +982,27 @@ out: | |||
963 | } | 982 | } |
964 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 983 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); |
965 | 984 | ||
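mem_cgroup_count_vm_event() now bumps the memcg's per-cpu event array directly instead of going through the removed mem_cgroup_pgfault()/mem_cgroup_pgmajfault() wrappers. The fragment below is only a shape sketch of that switch-and-increment, with an invented stat structure in place of the kernel's percpu counters:

/* Placeholder model of per-event counters keyed by an event id. */
#include <stdio.h>

enum memcg_event { EV_PGFAULT, EV_PGMAJFAULT, NR_EVENTS };

struct memcg_stat { unsigned long events[NR_EVENTS]; };

static void count_vm_event(struct memcg_stat *stat, enum memcg_event idx)
{
	switch (idx) {
	case EV_PGFAULT:
		stat->events[EV_PGFAULT]++;	/* this_cpu_inc() in the kernel */
		break;
	case EV_PGMAJFAULT:
		stat->events[EV_PGMAJFAULT]++;
		break;
	default:
		break;				/* the kernel BUG()s here */
	}
}

int main(void)
{
	struct memcg_stat stat = { { 0 } };

	count_vm_event(&stat, EV_PGFAULT);
	count_vm_event(&stat, EV_PGFAULT);
	count_vm_event(&stat, EV_PGMAJFAULT);
	printf("pgfault=%lu pgmajfault=%lu\n",
	       stat.events[EV_PGFAULT], stat.events[EV_PGMAJFAULT]);
	return 0;
}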
985 | /** | ||
986 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | ||
987 | * @zone: zone of the wanted lruvec | ||
988 | * @mem: memcg of the wanted lruvec | ||
989 | * | ||
990 | * Returns the lru list vector holding pages for the given @zone and | ||
991 | * @mem. This can be the global zone lruvec, if the memory controller | ||
992 | * is disabled. | ||
993 | */ | ||
994 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | ||
995 | struct mem_cgroup *memcg) | ||
996 | { | ||
997 | struct mem_cgroup_per_zone *mz; | ||
998 | |||
999 | if (mem_cgroup_disabled()) | ||
1000 | return &zone->lruvec; | ||
1001 | |||
1002 | mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); | ||
1003 | return &mz->lruvec; | ||
1004 | } | ||
1005 | |||
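The new mem_cgroup_zone_lruvec() gives reclaim a single lookup that falls back to the zone's own lruvec when the controller is disabled and otherwise resolves to the per-memcg, per-zone lruvec. A compact user-space analogue of that selection; the struct layout here is invented and only mirrors the decision, not the kernel types:

/* Invented structs mirroring the "global vs. per-memcg lruvec" choice. */
#include <stdbool.h>
#include <stdio.h>

struct lruvec { const char *owner; };

struct zone {
	struct lruvec lruvec;			/* used when the controller is off */
};

struct memcg_per_zone {
	struct lruvec lruvec;			/* used when the controller is on */
};

struct memcg {
	struct memcg_per_zone zoneinfo[1];	/* one zone in this sketch */
};

static bool memcg_disabled;

static struct lruvec *zone_lruvec(struct zone *zone, struct memcg *memcg)
{
	if (memcg_disabled)
		return &zone->lruvec;
	return &memcg->zoneinfo[0].lruvec;
}

int main(void)
{
	struct zone zone = { { "zone" } };
	struct memcg memcg = { { { { "memcg" } } } };

	memcg_disabled = false;
	printf("%s\n", zone_lruvec(&zone, &memcg)->owner);	/* "memcg" */
	memcg_disabled = true;
	printf("%s\n", zone_lruvec(&zone, &memcg)->owner);	/* "zone" */
	return 0;
}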
966 | /* | 1006 | /* |
967 | * Following LRU functions are allowed to be used without PCG_LOCK. | 1007 | * Following LRU functions are allowed to be used without PCG_LOCK. |
968 | * Operations are called by routine of global LRU independently from memcg. | 1008 | * Operations are called by routine of global LRU independently from memcg. |
@@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); | |||
977 | * When moving account, the page is not on LRU. It's isolated. | 1017 | * When moving account, the page is not on LRU. It's isolated. |
978 | */ | 1018 | */ |
979 | 1019 | ||
980 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 1020 | /** |
981 | { | 1021 | * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec |
982 | struct page_cgroup *pc; | 1022 | * @zone: zone of the page |
983 | struct mem_cgroup_per_zone *mz; | 1023 | * @page: the page |
984 | 1024 | * @lru: current lru | |
985 | if (mem_cgroup_disabled()) | 1025 | * |
986 | return; | 1026 | * This function accounts for @page being added to @lru, and returns |
987 | pc = lookup_page_cgroup(page); | 1027 | * the lruvec for the given @zone and the memcg @page is charged to. |
988 | /* can happen while we handle swapcache. */ | 1028 | * |
989 | if (!TestClearPageCgroupAcctLRU(pc)) | 1029 | * The callsite is then responsible for physically linking the page to |
990 | return; | 1030 | * the returned lruvec->lists[@lru]. |
991 | VM_BUG_ON(!pc->mem_cgroup); | ||
992 | /* | ||
993 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | ||
994 | * removed from global LRU. | ||
995 | */ | ||
996 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
997 | /* huge page split is done under lru_lock. so, we have no races. */ | ||
998 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | ||
999 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1000 | return; | ||
1001 | VM_BUG_ON(list_empty(&pc->lru)); | ||
1002 | list_del_init(&pc->lru); | ||
1003 | } | ||
1004 | |||
1005 | void mem_cgroup_del_lru(struct page *page) | ||
1006 | { | ||
1007 | mem_cgroup_del_lru_list(page, page_lru(page)); | ||
1008 | } | ||
1009 | |||
1010 | /* | ||
1011 | * Writeback is about to end against a page which has been marked for immediate | ||
1012 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | ||
1013 | * inactive list. | ||
1014 | */ | 1031 | */ |
1015 | void mem_cgroup_rotate_reclaimable_page(struct page *page) | 1032 | struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, |
1033 | enum lru_list lru) | ||
1016 | { | 1034 | { |
1017 | struct mem_cgroup_per_zone *mz; | 1035 | struct mem_cgroup_per_zone *mz; |
1036 | struct mem_cgroup *memcg; | ||
1018 | struct page_cgroup *pc; | 1037 | struct page_cgroup *pc; |
1019 | enum lru_list lru = page_lru(page); | ||
1020 | 1038 | ||
1021 | if (mem_cgroup_disabled()) | 1039 | if (mem_cgroup_disabled()) |
1022 | return; | 1040 | return &zone->lruvec; |
1023 | 1041 | ||
1024 | pc = lookup_page_cgroup(page); | 1042 | pc = lookup_page_cgroup(page); |
1025 | /* unused or root page is not rotated. */ | 1043 | memcg = pc->mem_cgroup; |
1026 | if (!PageCgroupUsed(pc)) | 1044 | mz = page_cgroup_zoneinfo(memcg, page); |
1027 | return; | 1045 | /* compound_order() is stabilized through lru_lock */ |
1028 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1046 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
1029 | smp_rmb(); | 1047 | return &mz->lruvec; |
1030 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1031 | return; | ||
1032 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1033 | list_move_tail(&pc->lru, &mz->lists[lru]); | ||
1034 | } | 1048 | } |
1035 | 1049 | ||
1036 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | 1050 | /** |
1051 | * mem_cgroup_lru_del_list - account for removing an lru page | ||
1052 | * @page: the page | ||
1053 | * @lru: target lru | ||
1054 | * | ||
1055 | * This function accounts for @page being removed from @lru. | ||
1056 | * | ||
1057 | * The callsite is then responsible for physically unlinking | ||
1058 | * @page->lru. | ||
1059 | */ | ||
1060 | void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | ||
1037 | { | 1061 | { |
1038 | struct mem_cgroup_per_zone *mz; | 1062 | struct mem_cgroup_per_zone *mz; |
1063 | struct mem_cgroup *memcg; | ||
1039 | struct page_cgroup *pc; | 1064 | struct page_cgroup *pc; |
1040 | 1065 | ||
1041 | if (mem_cgroup_disabled()) | 1066 | if (mem_cgroup_disabled()) |
1042 | return; | 1067 | return; |
1043 | 1068 | ||
1044 | pc = lookup_page_cgroup(page); | 1069 | pc = lookup_page_cgroup(page); |
1045 | /* unused or root page is not rotated. */ | 1070 | memcg = pc->mem_cgroup; |
1046 | if (!PageCgroupUsed(pc)) | 1071 | VM_BUG_ON(!memcg); |
1047 | return; | 1072 | mz = page_cgroup_zoneinfo(memcg, page); |
1048 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1049 | smp_rmb(); | ||
1050 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1051 | return; | ||
1052 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1053 | list_move(&pc->lru, &mz->lists[lru]); | ||
1054 | } | ||
1055 | |||
1056 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | ||
1057 | { | ||
1058 | struct page_cgroup *pc; | ||
1059 | struct mem_cgroup_per_zone *mz; | ||
1060 | |||
1061 | if (mem_cgroup_disabled()) | ||
1062 | return; | ||
1063 | pc = lookup_page_cgroup(page); | ||
1064 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
1065 | /* | ||
1066 | * putback: charge: | ||
1067 | * SetPageLRU SetPageCgroupUsed | ||
1068 | * smp_mb smp_mb | ||
1069 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1070 | * | ||
1071 | * Ensure that one of the two sides adds the page to the memcg | ||
1072 | * LRU during a race. | ||
1073 | */ | ||
1074 | smp_mb(); | ||
1075 | if (!PageCgroupUsed(pc)) | ||
1076 | return; | ||
1077 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1078 | smp_rmb(); | ||
1079 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1080 | /* huge page split is done under lru_lock. so, we have no races. */ | 1073 | /* huge page split is done under lru_lock. so, we have no races. */ |
1081 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1074 | VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); |
1082 | SetPageCgroupAcctLRU(pc); | 1075 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
1083 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1084 | return; | ||
1085 | list_add(&pc->lru, &mz->lists[lru]); | ||
1086 | } | ||
1087 | |||
1088 | /* | ||
1089 | * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed | ||
1090 | * while it's linked to lru because the page may be reused after it's fully | ||
1091 | * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. | ||
1092 | * It's done under lock_page and expected that zone->lru_lock isnever held. | ||
1093 | */ | ||
1094 | static void mem_cgroup_lru_del_before_commit(struct page *page) | ||
1095 | { | ||
1096 | unsigned long flags; | ||
1097 | struct zone *zone = page_zone(page); | ||
1098 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1099 | |||
1100 | /* | ||
1101 | * Doing this check without taking ->lru_lock seems wrong but this | ||
1102 | * is safe. Because if page_cgroup's USED bit is unset, the page | ||
1103 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is | ||
1104 | * set, the commit after this will fail, anyway. | ||
1105 | * This all charge/uncharge is done under some mutual execustion. | ||
1106 | * So, we don't need to taking care of changes in USED bit. | ||
1107 | */ | ||
1108 | if (likely(!PageLRU(page))) | ||
1109 | return; | ||
1110 | |||
1111 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
1112 | /* | ||
1113 | * Forget old LRU when this page_cgroup is *not* used. This Used bit | ||
1114 | * is guarded by lock_page() because the page is SwapCache. | ||
1115 | */ | ||
1116 | if (!PageCgroupUsed(pc)) | ||
1117 | mem_cgroup_del_lru_list(page, page_lru(page)); | ||
1118 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1119 | } | 1076 | } |
1120 | 1077 | ||
1121 | static void mem_cgroup_lru_add_after_commit(struct page *page) | 1078 | void mem_cgroup_lru_del(struct page *page) |
1122 | { | 1079 | { |
1123 | unsigned long flags; | 1080 | mem_cgroup_lru_del_list(page, page_lru(page)); |
1124 | struct zone *zone = page_zone(page); | ||
1125 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1126 | /* | ||
1127 | * putback: charge: | ||
1128 | * SetPageLRU SetPageCgroupUsed | ||
1129 | * smp_mb smp_mb | ||
1130 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1131 | * | ||
1132 | * Ensure that one of the two sides adds the page to the memcg | ||
1133 | * LRU during a race. | ||
1134 | */ | ||
1135 | smp_mb(); | ||
1136 | /* taking care of that the page is added to LRU while we commit it */ | ||
1137 | if (likely(!PageLRU(page))) | ||
1138 | return; | ||
1139 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
1140 | /* link when the page is linked to LRU but page_cgroup isn't */ | ||
1141 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) | ||
1142 | mem_cgroup_add_lru_list(page, page_lru(page)); | ||
1143 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1144 | } | 1081 | } |
1145 | 1082 | ||
1146 | 1083 | /** | |
1147 | void mem_cgroup_move_lists(struct page *page, | 1084 | * mem_cgroup_lru_move_lists - account for moving a page between lrus |
1148 | enum lru_list from, enum lru_list to) | 1085 | * @zone: zone of the page |
1086 | * @page: the page | ||
1087 | * @from: current lru | ||
1088 | * @to: target lru | ||
1089 | * | ||
1090 | * This function accounts for @page being moved between the lrus @from | ||
1091 | * and @to, and returns the lruvec for the given @zone and the memcg | ||
1092 | * @page is charged to. | ||
1093 | * | ||
1094 | * The callsite is then responsible for physically relinking | ||
1095 | * @page->lru to the returned lruvec->lists[@to]. | ||
1096 | */ | ||
1097 | struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, | ||
1098 | struct page *page, | ||
1099 | enum lru_list from, | ||
1100 | enum lru_list to) | ||
1149 | { | 1101 | { |
1150 | if (mem_cgroup_disabled()) | 1102 | /* XXX: Optimize this, especially for @from == @to */ |
1151 | return; | 1103 | mem_cgroup_lru_del_list(page, from); |
1152 | mem_cgroup_del_lru_list(page, from); | 1104 | return mem_cgroup_lru_add_list(zone, page, to); |
1153 | mem_cgroup_add_lru_list(page, to); | ||
1154 | } | 1105 | } |
1155 | 1106 | ||
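After this block the memcg LRU hooks no longer maintain their own page lists: mem_cgroup_lru_add_list()/del_list()/move_lists() only adjust the per-zone statistics (scaled by the compound order for huge pages) and return the lruvec, and the callsite does the physical list_add/list_del. A sketch of that split contract with stand-in structures and a trivial linked push in place of lruvec->lists[]:

/* Sketch of the new contract: the memcg hook only adjusts statistics and
 * returns which list to use; the caller links the page itself.  The
 * structures below are stand-ins for lruvec/mem_cgroup_per_zone. */
#include <stdio.h>

enum lru_list { LRU_INACTIVE, LRU_ACTIVE, NR_LRU_LISTS };

struct page { int order; struct page *next; };	/* covers 2^order base pages */

struct lruvec { struct page *lists[NR_LRU_LISTS]; };

struct memcg_zone {
	struct lruvec lruvec;
	unsigned long zstat[NR_LRU_LISTS];	/* pages accounted per lru */
};

static struct lruvec *lru_add_list(struct memcg_zone *mz, struct page *page,
				   enum lru_list lru)
{
	mz->zstat[lru] += 1UL << page->order;	/* huge pages count in full */
	return &mz->lruvec;
}

static void lru_del_list(struct memcg_zone *mz, struct page *page,
			 enum lru_list lru)
{
	mz->zstat[lru] -= 1UL << page->order;
}

int main(void)
{
	struct memcg_zone mz = { .zstat = { 0 } };
	struct page huge = { .order = 9 };	/* a 512-page THP */
	struct lruvec *lruvec;

	/* callsite: account first, then physically link the page */
	lruvec = lru_add_list(&mz, &huge, LRU_INACTIVE);
	huge.next = lruvec->lists[LRU_INACTIVE];
	lruvec->lists[LRU_INACTIVE] = &huge;
	printf("inactive pages accounted: %lu\n", mz.zstat[LRU_INACTIVE]);

	/* a move is del from the old lru plus add to the new one; the
	 * caller would then relink huge onto lruvec->lists[LRU_ACTIVE] */
	lru_del_list(&mz, &huge, LRU_INACTIVE);
	lruvec = lru_add_list(&mz, &huge, LRU_ACTIVE);
	printf("active pages accounted: %lu\n", mz.zstat[LRU_ACTIVE]);
	return 0;
}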
1156 | /* | 1107 | /* |
@@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | |||
1175 | struct task_struct *p; | 1126 | struct task_struct *p; |
1176 | 1127 | ||
1177 | p = find_lock_task_mm(task); | 1128 | p = find_lock_task_mm(task); |
1178 | if (!p) | 1129 | if (p) { |
1179 | return 0; | 1130 | curr = try_get_mem_cgroup_from_mm(p->mm); |
1180 | curr = try_get_mem_cgroup_from_mm(p->mm); | 1131 | task_unlock(p); |
1181 | task_unlock(p); | 1132 | } else { |
1133 | /* | ||
1134 | * All threads may have already detached their mm's, but the oom | ||
1135 | * killer still needs to detect if they have already been oom | ||
1136 | * killed to prevent needlessly killing additional tasks. | ||
1137 | */ | ||
1138 | task_lock(task); | ||
1139 | curr = mem_cgroup_from_task(task); | ||
1140 | if (curr) | ||
1141 | css_get(&curr->css); | ||
1142 | task_unlock(task); | ||
1143 | } | ||
1182 | if (!curr) | 1144 | if (!curr) |
1183 | return 0; | 1145 | return 0; |
1184 | /* | 1146 | /* |
@@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1258 | return &mz->reclaim_stat; | 1220 | return &mz->reclaim_stat; |
1259 | } | 1221 | } |
1260 | 1222 | ||
1261 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | ||
1262 | struct list_head *dst, | ||
1263 | unsigned long *scanned, int order, | ||
1264 | isolate_mode_t mode, | ||
1265 | struct zone *z, | ||
1266 | struct mem_cgroup *mem_cont, | ||
1267 | int active, int file) | ||
1268 | { | ||
1269 | unsigned long nr_taken = 0; | ||
1270 | struct page *page; | ||
1271 | unsigned long scan; | ||
1272 | LIST_HEAD(pc_list); | ||
1273 | struct list_head *src; | ||
1274 | struct page_cgroup *pc, *tmp; | ||
1275 | int nid = zone_to_nid(z); | ||
1276 | int zid = zone_idx(z); | ||
1277 | struct mem_cgroup_per_zone *mz; | ||
1278 | int lru = LRU_FILE * file + active; | ||
1279 | int ret; | ||
1280 | |||
1281 | BUG_ON(!mem_cont); | ||
1282 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
1283 | src = &mz->lists[lru]; | ||
1284 | |||
1285 | scan = 0; | ||
1286 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | ||
1287 | if (scan >= nr_to_scan) | ||
1288 | break; | ||
1289 | |||
1290 | if (unlikely(!PageCgroupUsed(pc))) | ||
1291 | continue; | ||
1292 | |||
1293 | page = lookup_cgroup_page(pc); | ||
1294 | |||
1295 | if (unlikely(!PageLRU(page))) | ||
1296 | continue; | ||
1297 | |||
1298 | scan++; | ||
1299 | ret = __isolate_lru_page(page, mode, file); | ||
1300 | switch (ret) { | ||
1301 | case 0: | ||
1302 | list_move(&page->lru, dst); | ||
1303 | mem_cgroup_del_lru(page); | ||
1304 | nr_taken += hpage_nr_pages(page); | ||
1305 | break; | ||
1306 | case -EBUSY: | ||
1307 | /* we don't affect global LRU but rotate in our LRU */ | ||
1308 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
1309 | break; | ||
1310 | default: | ||
1311 | break; | ||
1312 | } | ||
1313 | } | ||
1314 | |||
1315 | *scanned = scan; | ||
1316 | |||
1317 | trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, | ||
1318 | 0, 0, 0, mode); | ||
1319 | |||
1320 | return nr_taken; | ||
1321 | } | ||
1322 | |||
1323 | #define mem_cgroup_from_res_counter(counter, member) \ | 1223 | #define mem_cgroup_from_res_counter(counter, member) \ |
1324 | container_of(counter, struct mem_cgroup, member) | 1224 | container_of(counter, struct mem_cgroup, member) |
1325 | 1225 | ||
@@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1536 | return min(limit, memsw); | 1436 | return min(limit, memsw); |
1537 | } | 1437 | } |
1538 | 1438 | ||
1539 | /* | 1439 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1540 | * Visit the first child (need not be the first child as per the ordering | 1440 | gfp_t gfp_mask, |
1541 | * of the cgroup list, since we track last_scanned_child) of @mem and use | 1441 | unsigned long flags) |
1542 | * that to reclaim free pages from. | ||
1543 | */ | ||
1544 | static struct mem_cgroup * | ||
1545 | mem_cgroup_select_victim(struct mem_cgroup *root_memcg) | ||
1546 | { | 1442 | { |
1547 | struct mem_cgroup *ret = NULL; | 1443 | unsigned long total = 0; |
1548 | struct cgroup_subsys_state *css; | 1444 | bool noswap = false; |
1549 | int nextid, found; | 1445 | int loop; |
1550 | |||
1551 | if (!root_memcg->use_hierarchy) { | ||
1552 | css_get(&root_memcg->css); | ||
1553 | ret = root_memcg; | ||
1554 | } | ||
1555 | 1446 | ||
1556 | while (!ret) { | 1447 | if (flags & MEM_CGROUP_RECLAIM_NOSWAP) |
1557 | rcu_read_lock(); | 1448 | noswap = true; |
1558 | nextid = root_memcg->last_scanned_child + 1; | 1449 | if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) |
1559 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, | 1450 | noswap = true; |
1560 | &found); | ||
1561 | if (css && css_tryget(css)) | ||
1562 | ret = container_of(css, struct mem_cgroup, css); | ||
1563 | 1451 | ||
1564 | rcu_read_unlock(); | 1452 | for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { |
1565 | /* Updates scanning parameter */ | 1453 | if (loop) |
1566 | if (!css) { | 1454 | drain_all_stock_async(memcg); |
1567 | /* this means start scan from ID:1 */ | 1455 | total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); |
1568 | root_memcg->last_scanned_child = 0; | 1456 | /* |
1569 | } else | 1457 | * Allow limit shrinkers, which are triggered directly |
1570 | root_memcg->last_scanned_child = found; | 1458 | * by userspace, to catch signals and stop reclaim |
1459 | * after minimal progress, regardless of the margin. | ||
1460 | */ | ||
1461 | if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) | ||
1462 | break; | ||
1463 | if (mem_cgroup_margin(memcg)) | ||
1464 | break; | ||
1465 | /* | ||
1466 | * If nothing was reclaimed after two attempts, there | ||
1467 | * may be no reclaimable pages in this hierarchy. | ||
1468 | */ | ||
1469 | if (loop && !total) | ||
1470 | break; | ||
1571 | } | 1471 | } |
1572 | 1472 | return total; | |
1573 | return ret; | ||
1574 | } | 1473 | } |
1575 | 1474 | ||
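mem_cgroup_reclaim() is the limit-reclaim half of the old hierarchical_reclaim(): retry try_to_free_mem_cgroup_pages() up to MEM_CGROUP_MAX_RECLAIM_LOOPS, honour the NOSWAP/SHRINK flags, and stop early once the margin is back, once a user-triggered shrink made any progress, or after two passes that freed nothing. The following is a control-flow model only; the reclaim and margin helpers are stubs with made-up numbers:

/* Control-flow model only: reclaim, margin, and flags are stubbed. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_LOOPS 100
#define RECLAIM_NOSWAP (1 << 0)
#define RECLAIM_SHRINK (1 << 1)

static unsigned long margin_pages;		/* free room under the limit */

static unsigned long try_reclaim_pages(bool noswap)
{
	static unsigned long freeable = 70;	/* pretend 70 pages can be freed */
	unsigned long got = freeable > 32 ? 32 : freeable;

	(void)noswap;	/* a real reclaimer would skip swap-backed pages */
	freeable -= got;
	return got;
}

static unsigned long memcg_reclaim(unsigned long flags)
{
	unsigned long total = 0;
	bool noswap = flags & RECLAIM_NOSWAP;
	int loop;

	for (loop = 0; loop < MAX_RECLAIM_LOOPS; loop++) {
		unsigned long freed = try_reclaim_pages(noswap);

		total += freed;
		margin_pages += freed;
		/* userspace-triggered shrinking stops after any progress */
		if (total && (flags & RECLAIM_SHRINK))
			break;
		/* enough room under the limit again */
		if (margin_pages >= 64)
			break;
		/* two passes with nothing freed: give up */
		if (loop && !total)
			break;
	}
	return total;
}

int main(void)
{
	printf("reclaimed %lu pages\n", memcg_reclaim(0));
	return 0;
}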
1576 | /** | 1475 | /** |
@@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
1710 | } | 1609 | } |
1711 | #endif | 1610 | #endif |
1712 | 1611 | ||
1713 | /* | 1612 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1714 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1613 | struct zone *zone, |
1715 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1614 | gfp_t gfp_mask, |
1716 | * based on its position in the children list. | 1615 | unsigned long *total_scanned) |
1717 | * | 1616 | { |
1718 | * root_memcg is the original ancestor that we've been reclaim from. | 1617 | struct mem_cgroup *victim = NULL; |
1719 | * | 1618 | int total = 0; |
1720 | * We give up and return to the caller when we visit root_memcg twice. | ||
1721 | * (other groups can be removed while we're walking....) | ||
1722 | * | ||
1723 | * If shrink==true, for avoiding to free too much, this returns immedieately. | ||
1724 | */ | ||
1725 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | ||
1726 | struct zone *zone, | ||
1727 | gfp_t gfp_mask, | ||
1728 | unsigned long reclaim_options, | ||
1729 | unsigned long *total_scanned) | ||
1730 | { | ||
1731 | struct mem_cgroup *victim; | ||
1732 | int ret, total = 0; | ||
1733 | int loop = 0; | 1619 | int loop = 0; |
1734 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
1735 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
1736 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
1737 | unsigned long excess; | 1620 | unsigned long excess; |
1738 | unsigned long nr_scanned; | 1621 | unsigned long nr_scanned; |
1622 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
1623 | .zone = zone, | ||
1624 | .priority = 0, | ||
1625 | }; | ||
1739 | 1626 | ||
1740 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1627 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; |
1741 | 1628 | ||
1742 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | ||
1743 | if (!check_soft && !shrink && root_memcg->memsw_is_minimum) | ||
1744 | noswap = true; | ||
1745 | |||
1746 | while (1) { | 1629 | while (1) { |
1747 | victim = mem_cgroup_select_victim(root_memcg); | 1630 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
1748 | if (victim == root_memcg) { | 1631 | if (!victim) { |
1749 | loop++; | 1632 | loop++; |
1750 | /* | ||
1751 | * We are not draining per cpu cached charges during | ||
1752 | * soft limit reclaim because global reclaim doesn't | ||
1753 | * care about charges. It tries to free some memory and | ||
1754 | * charges will not give any. | ||
1755 | */ | ||
1756 | if (!check_soft && loop >= 1) | ||
1757 | drain_all_stock_async(root_memcg); | ||
1758 | if (loop >= 2) { | 1633 | if (loop >= 2) { |
1759 | /* | 1634 | /* |
1760 | * If we have not been able to reclaim | 1635 | * If we have not been able to reclaim |
1761 | * anything, it might be because there are | 1636 | * anything, it might be because there are |
1762 | * no reclaimable pages under this hierarchy | 1637 | * no reclaimable pages under this hierarchy |
1763 | */ | 1638 | */ |
1764 | if (!check_soft || !total) { | 1639 | if (!total) |
1765 | css_put(&victim->css); | ||
1766 | break; | 1640 | break; |
1767 | } | ||
1768 | /* | 1641 | /* |
1769 | * We want to do more targeted reclaim. | 1642 | * We want to do more targeted reclaim. |
1770 | * excess >> 2 is not too excessive so as to | 1643 | * excess >> 2 is not too excessive so as to |
@@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | |||
1772 | * coming back to reclaim from this cgroup | 1645 | * coming back to reclaim from this cgroup |
1773 | */ | 1646 | */ |
1774 | if (total >= (excess >> 2) || | 1647 | if (total >= (excess >> 2) || |
1775 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | 1648 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) |
1776 | css_put(&victim->css); | ||
1777 | break; | 1649 | break; |
1778 | } | ||
1779 | } | 1650 | } |
1780 | } | ||
1781 | if (!mem_cgroup_reclaimable(victim, noswap)) { | ||
1782 | /* this cgroup's local usage == 0 */ | ||
1783 | css_put(&victim->css); | ||
1784 | continue; | 1651 | continue; |
1785 | } | 1652 | } |
1786 | /* we use swappiness of local cgroup */ | 1653 | if (!mem_cgroup_reclaimable(victim, false)) |
1787 | if (check_soft) { | 1654 | continue; |
1788 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1655 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, |
1789 | noswap, zone, &nr_scanned); | 1656 | zone, &nr_scanned); |
1790 | *total_scanned += nr_scanned; | 1657 | *total_scanned += nr_scanned; |
1791 | } else | 1658 | if (!res_counter_soft_limit_excess(&root_memcg->res)) |
1792 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1659 | break; |
1793 | noswap); | ||
1794 | css_put(&victim->css); | ||
1795 | /* | ||
1796 | * At shrinking usage, we can't check we should stop here or | ||
1797 | * reclaim more. It's depends on callers. last_scanned_child | ||
1798 | * will work enough for keeping fairness under tree. | ||
1799 | */ | ||
1800 | if (shrink) | ||
1801 | return ret; | ||
1802 | total += ret; | ||
1803 | if (check_soft) { | ||
1804 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
1805 | return total; | ||
1806 | } else if (mem_cgroup_margin(root_memcg)) | ||
1807 | return total; | ||
1808 | } | 1660 | } |
1661 | mem_cgroup_iter_break(root_memcg, victim); | ||
1809 | return total; | 1662 | return total; |
1810 | } | 1663 | } |
1811 | 1664 | ||
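mem_cgroup_soft_reclaim() keeps just the soft-limit walk: it iterates the hierarchy with a reclaim cookie, counts a loop each time the walk wraps (victim comes back NULL), gives up after two empty rounds or once roughly a quarter of the excess is gone, and must leave via mem_cgroup_iter_break() when it exits the walk early. Below is only the loop skeleton with the iterator and shrinker stubbed out; the numbers and helper names are placeholders:

/* Skeleton of the soft-limit walk; the iterator and shrinker are stubs. */
#include <stdio.h>

#define NR_MEMCGS 3
#define MAX_RECLAIM_LOOPS 100

static long excess = 200;	/* pages over the soft limit */

/* Stub walk: returns 0..NR_MEMCGS-1, then -1 once per round-trip. */
static int next_victim(int prev)
{
	return prev + 1 < NR_MEMCGS ? prev + 1 : -1;
}

/* Stub shrinker: pretend each child gives back a few pages. */
static unsigned long shrink_victim(int victim)
{
	(void)victim;
	excess -= 10;
	return 10;
}

static unsigned long soft_reclaim(void)
{
	unsigned long total = 0;
	unsigned long target = excess / 4;	/* excess >> 2 in the kernel */
	int victim = -1;
	int loop = 0;

	while (1) {
		victim = next_victim(victim);
		if (victim < 0) {			/* the walk wrapped */
			loop++;
			if (loop >= 2) {
				if (!total)		/* nothing reclaimable here */
					break;
				if (total >= target || loop > MAX_RECLAIM_LOOPS)
					break;
			}
			continue;
		}
		total += shrink_victim(victim);
		if (excess <= 0)	/* soft limit no longer exceeded */
			break;
	}
	/* the kernel calls mem_cgroup_iter_break(root, victim) when leaving early */
	return total;
}

int main(void)
{
	printf("soft reclaim freed %lu pages\n", soft_reclaim());
	return 0;
}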
@@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | |||
1817 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | 1670 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) |
1818 | { | 1671 | { |
1819 | struct mem_cgroup *iter, *failed = NULL; | 1672 | struct mem_cgroup *iter, *failed = NULL; |
1820 | bool cond = true; | ||
1821 | 1673 | ||
1822 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { | 1674 | for_each_mem_cgroup_tree(iter, memcg) { |
1823 | if (iter->oom_lock) { | 1675 | if (iter->oom_lock) { |
1824 | /* | 1676 | /* |
1825 | * this subtree of our hierarchy is already locked | 1677 | * this subtree of our hierarchy is already locked |
1826 | * so we cannot give a lock. | 1678 | * so we cannot give a lock. |
1827 | */ | 1679 | */ |
1828 | failed = iter; | 1680 | failed = iter; |
1829 | cond = false; | 1681 | mem_cgroup_iter_break(memcg, iter); |
1682 | break; | ||
1830 | } else | 1683 | } else |
1831 | iter->oom_lock = true; | 1684 | iter->oom_lock = true; |
1832 | } | 1685 | } |
@@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | |||
1838 | * OK, we failed to lock the whole subtree so we have to clean up | 1691 | * OK, we failed to lock the whole subtree so we have to clean up |
1839 | * what we set up to the failing subtree | 1692 | * what we set up to the failing subtree |
1840 | */ | 1693 | */ |
1841 | cond = true; | 1694 | for_each_mem_cgroup_tree(iter, memcg) { |
1842 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { | ||
1843 | if (iter == failed) { | 1695 | if (iter == failed) { |
1844 | cond = false; | 1696 | mem_cgroup_iter_break(memcg, iter); |
1845 | continue; | 1697 | break; |
1846 | } | 1698 | } |
1847 | iter->oom_lock = false; | 1699 | iter->oom_lock = false; |
1848 | } | 1700 | } |
@@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2007 | bool need_unlock = false; | 1859 | bool need_unlock = false; |
2008 | unsigned long uninitialized_var(flags); | 1860 | unsigned long uninitialized_var(flags); |
2009 | 1861 | ||
2010 | if (unlikely(!pc)) | 1862 | if (mem_cgroup_disabled()) |
2011 | return; | 1863 | return; |
2012 | 1864 | ||
2013 | rcu_read_lock(); | 1865 | rcu_read_lock(); |
@@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2238 | struct mem_cgroup *iter; | 2090 | struct mem_cgroup *iter; |
2239 | 2091 | ||
2240 | if ((action == CPU_ONLINE)) { | 2092 | if ((action == CPU_ONLINE)) { |
2241 | for_each_mem_cgroup_all(iter) | 2093 | for_each_mem_cgroup(iter) |
2242 | synchronize_mem_cgroup_on_move(iter, cpu); | 2094 | synchronize_mem_cgroup_on_move(iter, cpu); |
2243 | return NOTIFY_OK; | 2095 | return NOTIFY_OK; |
2244 | } | 2096 | } |
@@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2246 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2098 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) |
2247 | return NOTIFY_OK; | 2099 | return NOTIFY_OK; |
2248 | 2100 | ||
2249 | for_each_mem_cgroup_all(iter) | 2101 | for_each_mem_cgroup(iter) |
2250 | mem_cgroup_drain_pcp_counter(iter, cpu); | 2102 | mem_cgroup_drain_pcp_counter(iter, cpu); |
2251 | 2103 | ||
2252 | stock = &per_cpu(memcg_stock, cpu); | 2104 | stock = &per_cpu(memcg_stock, cpu); |
@@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2300 | if (!(gfp_mask & __GFP_WAIT)) | 2152 | if (!(gfp_mask & __GFP_WAIT)) |
2301 | return CHARGE_WOULDBLOCK; | 2153 | return CHARGE_WOULDBLOCK; |
2302 | 2154 | ||
2303 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 2155 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2304 | gfp_mask, flags, NULL); | ||
2305 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2156 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2306 | return CHARGE_RETRY; | 2157 | return CHARGE_RETRY; |
2307 | /* | 2158 | /* |
@@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2334 | } | 2185 | } |
2335 | 2186 | ||
2336 | /* | 2187 | /* |
2337 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 2188 | * __mem_cgroup_try_charge() does |
2338 | * oom-killer can be invoked. | 2189 | * 1. detect memcg to be charged against from passed *mm and *ptr, |
2190 | * 2. update res_counter | ||
2191 | * 3. call memory reclaim if necessary. | ||
2192 | * | ||
2193 | * In some special cases, if the task is dying (fatal_signal_pending() is true | ||
2194 | * or it has TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup | ||
2195 | * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon | ||
2196 | * as possible without any hazards. 2: all pages should have a valid | ||
2197 | * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg | ||
2198 | * pointer, that is treated as a charge to root_mem_cgroup. | ||
2199 | * | ||
2200 | * So __mem_cgroup_try_charge() will return | ||
2201 | * 0 ... on success, filling *ptr with a valid memcg pointer. | ||
2202 | * -ENOMEM ... charge failure because of resource limits. | ||
2203 | * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. | ||
2204 | * | ||
2205 | * Unlike the exported interface, an "oom" parameter is added. if oom==true, | ||
2206 | * the oom-killer can be invoked. | ||
2339 | */ | 2207 | */ |
2340 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2208 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
2341 | gfp_t gfp_mask, | 2209 | gfp_t gfp_mask, |
@@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2364 | * set, if so charge the init_mm (happens for pagecache usage). | 2232 | * set, if so charge the init_mm (happens for pagecache usage). |
2365 | */ | 2233 | */ |
2366 | if (!*ptr && !mm) | 2234 | if (!*ptr && !mm) |
2367 | goto bypass; | 2235 | *ptr = root_mem_cgroup; |
2368 | again: | 2236 | again: |
2369 | if (*ptr) { /* css should be a valid one */ | 2237 | if (*ptr) { /* css should be a valid one */ |
2370 | memcg = *ptr; | 2238 | memcg = *ptr; |
@@ -2390,7 +2258,9 @@ again: | |||
2390 | * task-struct. So, mm->owner can be NULL. | 2258 | * task-struct. So, mm->owner can be NULL. |
2391 | */ | 2259 | */ |
2392 | memcg = mem_cgroup_from_task(p); | 2260 | memcg = mem_cgroup_from_task(p); |
2393 | if (!memcg || mem_cgroup_is_root(memcg)) { | 2261 | if (!memcg) |
2262 | memcg = root_mem_cgroup; | ||
2263 | if (mem_cgroup_is_root(memcg)) { | ||
2394 | rcu_read_unlock(); | 2264 | rcu_read_unlock(); |
2395 | goto done; | 2265 | goto done; |
2396 | } | 2266 | } |
@@ -2465,8 +2335,8 @@ nomem: | |||
2465 | *ptr = NULL; | 2335 | *ptr = NULL; |
2466 | return -ENOMEM; | 2336 | return -ENOMEM; |
2467 | bypass: | 2337 | bypass: |
2468 | *ptr = NULL; | 2338 | *ptr = root_mem_cgroup; |
2469 | return 0; | 2339 | return -EINTR; |
2470 | } | 2340 | } |
2471 | 2341 | ||
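The bypass path no longer returns a NULL memcg: a dying task now gets root_mem_cgroup in *ptr together with -EINTR, and each caller decides whether that counts as success (the swap-in paths below convert -EINTR to 0, while mem_cgroup_prepare_migration still collapses it to -ENOMEM, as its comment notes). A small sketch of a caller consuming that contract; the function names here are placeholders, not the kernel API:

/* Placeholder caller showing how the -EINTR/root bypass is consumed. */
#include <errno.h>
#include <stdio.h>

struct memcg { const char *name; };

static struct memcg root_memcg = { "root" };

/* Stub of __mem_cgroup_try_charge(): a dying task is bypassed to root. */
static int try_charge(int task_is_dying, struct memcg **memcgp)
{
	if (task_is_dying) {
		*memcgp = &root_memcg;	/* every page keeps a valid owner */
		return -EINTR;
	}
	*memcgp = &root_memcg;		/* pretend the charge succeeded */
	return 0;
}

static int charge_swapin(int task_is_dying, struct memcg **memcgp)
{
	int ret = try_charge(task_is_dying, memcgp);

	if (ret == -EINTR)	/* bypassed charges are not an error here */
		ret = 0;
	return ret;
}

int main(void)
{
	struct memcg *memcg;
	int ret = charge_swapin(1, &memcg);

	printf("ret=%d memcg=%s\n", ret, memcg->name);
	return 0;
}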
2472 | /* | 2342 | /* |
@@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2522 | memcg = NULL; | 2392 | memcg = NULL; |
2523 | } else if (PageSwapCache(page)) { | 2393 | } else if (PageSwapCache(page)) { |
2524 | ent.val = page_private(page); | 2394 | ent.val = page_private(page); |
2525 | id = lookup_swap_cgroup(ent); | 2395 | id = lookup_swap_cgroup_id(ent); |
2526 | rcu_read_lock(); | 2396 | rcu_read_lock(); |
2527 | memcg = mem_cgroup_lookup(id); | 2397 | memcg = mem_cgroup_lookup(id); |
2528 | if (memcg && !css_tryget(&memcg->css)) | 2398 | if (memcg && !css_tryget(&memcg->css)) |
@@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2574 | 2444 | ||
2575 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2445 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); |
2576 | unlock_page_cgroup(pc); | 2446 | unlock_page_cgroup(pc); |
2447 | WARN_ON_ONCE(PageLRU(page)); | ||
2577 | /* | 2448 | /* |
2578 | * "charge_statistics" updated event counter. Then, check it. | 2449 | * "charge_statistics" updated event counter. Then, check it. |
2579 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2450 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
@@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2585 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2456 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2586 | 2457 | ||
2587 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2458 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ |
2588 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) | 2459 | (1 << PCG_MIGRATION)) |
2589 | /* | 2460 | /* |
2590 | * Because tail pages are not marked as "used", set it. We're under | 2461 | * Because tail pages are not marked as "used", set it. We're under |
2591 | * zone->lru_lock, 'splitting on pmd' and compund_lock. | 2462 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
2463 | * charge/uncharge will be never happen and move_account() is done under | ||
2464 | * compound_lock(), so we don't have to take care of races. | ||
2592 | */ | 2465 | */ |
2593 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | 2466 | void mem_cgroup_split_huge_fixup(struct page *head) |
2594 | { | 2467 | { |
2595 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 2468 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
2596 | struct page_cgroup *tail_pc = lookup_page_cgroup(tail); | 2469 | struct page_cgroup *pc; |
2597 | unsigned long flags; | 2470 | int i; |
2598 | 2471 | ||
2599 | if (mem_cgroup_disabled()) | 2472 | if (mem_cgroup_disabled()) |
2600 | return; | 2473 | return; |
2601 | /* | 2474 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
2602 | * We have no races with charge/uncharge but will have races with | 2475 | pc = head_pc + i; |
2603 | * page state accounting. | 2476 | pc->mem_cgroup = head_pc->mem_cgroup; |
2604 | */ | 2477 | smp_wmb();/* see __commit_charge() */ |
2605 | move_lock_page_cgroup(head_pc, &flags); | 2478 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
2606 | |||
2607 | tail_pc->mem_cgroup = head_pc->mem_cgroup; | ||
2608 | smp_wmb(); /* see __commit_charge() */ | ||
2609 | if (PageCgroupAcctLRU(head_pc)) { | ||
2610 | enum lru_list lru; | ||
2611 | struct mem_cgroup_per_zone *mz; | ||
2612 | |||
2613 | /* | ||
2614 | * LRU flags cannot be copied because we need to add tail | ||
2615 | *.page to LRU by generic call and our hook will be called. | ||
2616 | * We hold lru_lock, then, reduce counter directly. | ||
2617 | */ | ||
2618 | lru = page_lru(head); | ||
2619 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); | ||
2620 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | ||
2621 | } | 2479 | } |
2622 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | ||
2623 | move_unlock_page_cgroup(head_pc, &flags); | ||
2624 | } | 2480 | } |
2625 | #endif | 2481 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
2626 | 2482 | ||
2627 | /** | 2483 | /** |
2628 | * mem_cgroup_move_account - move account of the page | 2484 | * mem_cgroup_move_account - move account of the page |
@@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2737 | 2593 | ||
2738 | parent = mem_cgroup_from_cont(pcg); | 2594 | parent = mem_cgroup_from_cont(pcg); |
2739 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2595 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2740 | if (ret || !parent) | 2596 | if (ret) |
2741 | goto put_back; | 2597 | goto put_back; |
2742 | 2598 | ||
2743 | if (nr_pages > 1) | 2599 | if (nr_pages > 1) |
@@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2783 | } | 2639 | } |
2784 | 2640 | ||
2785 | pc = lookup_page_cgroup(page); | 2641 | pc = lookup_page_cgroup(page); |
2786 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ | ||
2787 | |||
2788 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 2642 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2789 | if (ret || !memcg) | 2643 | if (ret == -ENOMEM) |
2790 | return ret; | 2644 | return ret; |
2791 | |||
2792 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); | 2645 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); |
2793 | return 0; | 2646 | return 0; |
2794 | } | 2647 | } |
@@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2798 | { | 2651 | { |
2799 | if (mem_cgroup_disabled()) | 2652 | if (mem_cgroup_disabled()) |
2800 | return 0; | 2653 | return 0; |
2801 | /* | 2654 | VM_BUG_ON(page_mapped(page)); |
2802 | * If already mapped, we don't have to account. | 2655 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2803 | * If page cache, page->mapping has address_space. | 2656 | VM_BUG_ON(!mm); |
2804 | * But page->mapping may have out-of-use anon_vma pointer, | ||
2805 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping | ||
2806 | * is NULL. | ||
2807 | */ | ||
2808 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | ||
2809 | return 0; | ||
2810 | if (unlikely(!mm)) | ||
2811 | mm = &init_mm; | ||
2812 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2657 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2813 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2658 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2814 | } | 2659 | } |
2815 | 2660 | ||
2816 | static void | 2661 | static void |
@@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, | |||
2822 | enum charge_type ctype) | 2667 | enum charge_type ctype) |
2823 | { | 2668 | { |
2824 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2669 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2670 | struct zone *zone = page_zone(page); | ||
2671 | unsigned long flags; | ||
2672 | bool removed = false; | ||
2673 | |||
2825 | /* | 2674 | /* |
2826 | * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page | 2675 | * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page |
2827 | * is already on the LRU. It means the page may be on some other page_cgroup's | 2676 | * is already on the LRU. It means the page may be on some other page_cgroup's |
2828 | * LRU. Take care of it. | 2677 | * LRU. Take care of it. |
2829 | */ | 2678 | */ |
2830 | mem_cgroup_lru_del_before_commit(page); | 2679 | spin_lock_irqsave(&zone->lru_lock, flags); |
2680 | if (PageLRU(page)) { | ||
2681 | del_page_from_lru_list(zone, page, page_lru(page)); | ||
2682 | ClearPageLRU(page); | ||
2683 | removed = true; | ||
2684 | } | ||
2831 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); | 2685 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
2832 | mem_cgroup_lru_add_after_commit(page); | 2686 | if (removed) { |
2687 | add_page_to_lru_list(zone, page, page_lru(page)); | ||
2688 | SetPageLRU(page); | ||
2689 | } | ||
2690 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
2833 | return; | 2691 | return; |
2834 | } | 2692 | } |
2835 | 2693 | ||
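__mem_cgroup_commit_charge_lrucare() now open-codes what the removed del_before_commit/add_after_commit helpers did: under zone->lru_lock it unlinks the page if it is already on an LRU, commits the charge (which rewrites pc->mem_cgroup), and relinks it, so the page never sits on the wrong memcg's list mid-charge. A condensed model of that isolate/commit/put-back bracket, with locking reduced to comments and a one-entry list standing in for the LRU:

/* Condensed model of the "isolate, commit, put back" bracket; the LRU is
 * a single-page list and locking is only indicated in comments. */
#include <stdbool.h>
#include <stdio.h>

struct page {
	bool on_lru;
	const char *owner;		/* stands in for pc->mem_cgroup */
	struct page *next;
};

static struct page *lru_head;

static void lru_del(struct page *page) { lru_head = page->next; page->on_lru = false; }
static void lru_add(struct page *page) { page->next = lru_head; lru_head = page; page->on_lru = true; }

static void commit_charge_lrucare(struct page *page, const char *memcg)
{
	bool removed = false;

	/* spin_lock_irqsave(&zone->lru_lock, flags) */
	if (page->on_lru) {
		lru_del(page);
		removed = true;
	}
	page->owner = memcg;		/* __mem_cgroup_commit_charge() */
	if (removed)
		lru_add(page);
	/* spin_unlock_irqrestore(&zone->lru_lock, flags) */
}

int main(void)
{
	struct page page = { 0 };

	lru_add(&page);			/* e.g. swapcache already on the LRU */
	commit_charge_lrucare(&page, "memcg A");
	printf("owner=%s on_lru=%d\n", page.owner, page.on_lru);
	return 0;
}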
@@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2837 | gfp_t gfp_mask) | 2695 | gfp_t gfp_mask) |
2838 | { | 2696 | { |
2839 | struct mem_cgroup *memcg = NULL; | 2697 | struct mem_cgroup *memcg = NULL; |
2698 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2840 | int ret; | 2699 | int ret; |
2841 | 2700 | ||
2842 | if (mem_cgroup_disabled()) | 2701 | if (mem_cgroup_disabled()) |
@@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2846 | 2705 | ||
2847 | if (unlikely(!mm)) | 2706 | if (unlikely(!mm)) |
2848 | mm = &init_mm; | 2707 | mm = &init_mm; |
2708 | if (!page_is_file_cache(page)) | ||
2709 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2849 | 2710 | ||
2850 | if (page_is_file_cache(page)) { | 2711 | if (!PageSwapCache(page)) |
2851 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); | 2712 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); |
2852 | if (ret || !memcg) | 2713 | else { /* page is swapcache/shmem */ |
2853 | return ret; | ||
2854 | |||
2855 | /* | ||
2856 | * FUSE reuses pages without going through the final | ||
2857 | * put that would remove them from the LRU list, make | ||
2858 | * sure that they get relinked properly. | ||
2859 | */ | ||
2860 | __mem_cgroup_commit_charge_lrucare(page, memcg, | ||
2861 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
2862 | return ret; | ||
2863 | } | ||
2864 | /* shmem */ | ||
2865 | if (PageSwapCache(page)) { | ||
2866 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | 2714 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); |
2867 | if (!ret) | 2715 | if (!ret) |
2868 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2716 | __mem_cgroup_commit_charge_swapin(page, memcg, type); |
2869 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2717 | } |
2870 | } else | ||
2871 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
2872 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | ||
2873 | |||
2874 | return ret; | 2718 | return ret; |
2875 | } | 2719 | } |
2876 | 2720 | ||
@@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2882 | */ | 2726 | */ |
2883 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2727 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2884 | struct page *page, | 2728 | struct page *page, |
2885 | gfp_t mask, struct mem_cgroup **ptr) | 2729 | gfp_t mask, struct mem_cgroup **memcgp) |
2886 | { | 2730 | { |
2887 | struct mem_cgroup *memcg; | 2731 | struct mem_cgroup *memcg; |
2888 | int ret; | 2732 | int ret; |
2889 | 2733 | ||
2890 | *ptr = NULL; | 2734 | *memcgp = NULL; |
2891 | 2735 | ||
2892 | if (mem_cgroup_disabled()) | 2736 | if (mem_cgroup_disabled()) |
2893 | return 0; | 2737 | return 0; |
@@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2905 | memcg = try_get_mem_cgroup_from_page(page); | 2749 | memcg = try_get_mem_cgroup_from_page(page); |
2906 | if (!memcg) | 2750 | if (!memcg) |
2907 | goto charge_cur_mm; | 2751 | goto charge_cur_mm; |
2908 | *ptr = memcg; | 2752 | *memcgp = memcg; |
2909 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); | 2753 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); |
2910 | css_put(&memcg->css); | 2754 | css_put(&memcg->css); |
2755 | if (ret == -EINTR) | ||
2756 | ret = 0; | ||
2911 | return ret; | 2757 | return ret; |
2912 | charge_cur_mm: | 2758 | charge_cur_mm: |
2913 | if (unlikely(!mm)) | 2759 | if (unlikely(!mm)) |
2914 | mm = &init_mm; | 2760 | mm = &init_mm; |
2915 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); | 2761 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2762 | if (ret == -EINTR) | ||
2763 | ret = 0; | ||
2764 | return ret; | ||
2916 | } | 2765 | } |
2917 | 2766 | ||
2918 | static void | 2767 | static void |
2919 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2768 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2920 | enum charge_type ctype) | 2769 | enum charge_type ctype) |
2921 | { | 2770 | { |
2922 | if (mem_cgroup_disabled()) | 2771 | if (mem_cgroup_disabled()) |
2923 | return; | 2772 | return; |
2924 | if (!ptr) | 2773 | if (!memcg) |
2925 | return; | 2774 | return; |
2926 | cgroup_exclude_rmdir(&ptr->css); | 2775 | cgroup_exclude_rmdir(&memcg->css); |
2927 | 2776 | ||
2928 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); | 2777 | __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); |
2929 | /* | 2778 | /* |
2930 | * Now swap is on-memory. This means this page may be | 2779 | * Now swap is on-memory. This means this page may be |
2931 | * counted both as mem and swap....double count. | 2780 | * counted both as mem and swap....double count. |
@@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2935 | */ | 2784 | */ |
2936 | if (do_swap_account && PageSwapCache(page)) { | 2785 | if (do_swap_account && PageSwapCache(page)) { |
2937 | swp_entry_t ent = {.val = page_private(page)}; | 2786 | swp_entry_t ent = {.val = page_private(page)}; |
2787 | struct mem_cgroup *swap_memcg; | ||
2938 | unsigned short id; | 2788 | unsigned short id; |
2939 | struct mem_cgroup *memcg; | ||
2940 | 2789 | ||
2941 | id = swap_cgroup_record(ent, 0); | 2790 | id = swap_cgroup_record(ent, 0); |
2942 | rcu_read_lock(); | 2791 | rcu_read_lock(); |
2943 | memcg = mem_cgroup_lookup(id); | 2792 | swap_memcg = mem_cgroup_lookup(id); |
2944 | if (memcg) { | 2793 | if (swap_memcg) { |
2945 | /* | 2794 | /* |
2946 | * This recorded memcg can be obsolete one. So, avoid | 2795 | * This recorded memcg can be obsolete one. So, avoid |
2947 | * calling css_tryget | 2796 | * calling css_tryget |
2948 | */ | 2797 | */ |
2949 | if (!mem_cgroup_is_root(memcg)) | 2798 | if (!mem_cgroup_is_root(swap_memcg)) |
2950 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 2799 | res_counter_uncharge(&swap_memcg->memsw, |
2951 | mem_cgroup_swap_statistics(memcg, false); | 2800 | PAGE_SIZE); |
2952 | mem_cgroup_put(memcg); | 2801 | mem_cgroup_swap_statistics(swap_memcg, false); |
2802 | mem_cgroup_put(swap_memcg); | ||
2953 | } | 2803 | } |
2954 | rcu_read_unlock(); | 2804 | rcu_read_unlock(); |
2955 | } | 2805 | } |
@@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2958 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2808 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2959 | * In that case, we need to call pre_destroy() again. check it here. | 2809 | * In that case, we need to call pre_destroy() again. check it here. |
2960 | */ | 2810 | */ |
2961 | cgroup_release_and_wakeup_rmdir(&ptr->css); | 2811 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
2962 | } | 2812 | } |
2963 | 2813 | ||
2964 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 2814 | void mem_cgroup_commit_charge_swapin(struct page *page, |
2815 | struct mem_cgroup *memcg) | ||
2965 | { | 2816 | { |
2966 | __mem_cgroup_commit_charge_swapin(page, ptr, | 2817 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2967 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2818 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2968 | } | 2819 | } |
2969 | 2820 | ||
2970 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2821 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) |
@@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3054 | * Check if our page_cgroup is valid | 2905 | * Check if our page_cgroup is valid |
3055 | */ | 2906 | */ |
3056 | pc = lookup_page_cgroup(page); | 2907 | pc = lookup_page_cgroup(page); |
3057 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 2908 | if (unlikely(!PageCgroupUsed(pc))) |
3058 | return NULL; | 2909 | return NULL; |
3059 | 2910 | ||
3060 | lock_page_cgroup(pc); | 2911 | lock_page_cgroup(pc); |
@@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3117 | /* early check. */ | 2968 | /* early check. */ |
3118 | if (page_mapped(page)) | 2969 | if (page_mapped(page)) |
3119 | return; | 2970 | return; |
3120 | if (page->mapping && !PageAnon(page)) | 2971 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3121 | return; | ||
3122 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2972 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
3123 | } | 2973 | } |
3124 | 2974 | ||
@@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void) | |||
3176 | batch->memcg = NULL; | 3026 | batch->memcg = NULL; |
3177 | } | 3027 | } |
3178 | 3028 | ||
3029 | /* | ||
3030 | * A function for resetting pc->mem_cgroup for newly allocated pages. | ||
3031 | * This function should be called if the newpage will be added to LRU | ||
3032 | * before accounting starts. | ||
3033 | */ | ||
3034 | void mem_cgroup_reset_owner(struct page *newpage) | ||
3035 | { | ||
3036 | struct page_cgroup *pc; | ||
3037 | |||
3038 | if (mem_cgroup_disabled()) | ||
3039 | return; | ||
3040 | |||
3041 | pc = lookup_page_cgroup(newpage); | ||
3042 | VM_BUG_ON(PageCgroupUsed(pc)); | ||
3043 | pc->mem_cgroup = root_mem_cgroup; | ||
3044 | } | ||
3045 | |||
3179 | #ifdef CONFIG_SWAP | 3046 | #ifdef CONFIG_SWAP |
3180 | /* | 3047 | /* |
3181 | * called after __delete_from_swap_cache() and drop "page" account. | 3048 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3293 | * page belongs to. | 3160 | * page belongs to. |
3294 | */ | 3161 | */ |
3295 | int mem_cgroup_prepare_migration(struct page *page, | 3162 | int mem_cgroup_prepare_migration(struct page *page, |
3296 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 3163 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) |
3297 | { | 3164 | { |
3298 | struct mem_cgroup *memcg = NULL; | 3165 | struct mem_cgroup *memcg = NULL; |
3299 | struct page_cgroup *pc; | 3166 | struct page_cgroup *pc; |
3300 | enum charge_type ctype; | 3167 | enum charge_type ctype; |
3301 | int ret = 0; | 3168 | int ret = 0; |
3302 | 3169 | ||
3303 | *ptr = NULL; | 3170 | *memcgp = NULL; |
3304 | 3171 | ||
3305 | VM_BUG_ON(PageTransHuge(page)); | 3172 | VM_BUG_ON(PageTransHuge(page)); |
3306 | if (mem_cgroup_disabled()) | 3173 | if (mem_cgroup_disabled()) |
@@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3351 | if (!memcg) | 3218 | if (!memcg) |
3352 | return 0; | 3219 | return 0; |
3353 | 3220 | ||
3354 | *ptr = memcg; | 3221 | *memcgp = memcg; |
3355 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); | 3222 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); |
3356 | css_put(&memcg->css);/* drop extra refcnt */ | 3223 | css_put(&memcg->css);/* drop extra refcnt */ |
3357 | if (ret || *ptr == NULL) { | 3224 | if (ret) { |
3358 | if (PageAnon(page)) { | 3225 | if (PageAnon(page)) { |
3359 | lock_page_cgroup(pc); | 3226 | lock_page_cgroup(pc); |
3360 | ClearPageCgroupMigration(pc); | 3227 | ClearPageCgroupMigration(pc); |
@@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3364 | */ | 3231 | */ |
3365 | mem_cgroup_uncharge_page(page); | 3232 | mem_cgroup_uncharge_page(page); |
3366 | } | 3233 | } |
3234 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3367 | return -ENOMEM; | 3235 | return -ENOMEM; |
3368 | } | 3236 | } |
3369 | /* | 3237 | /* |
@@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3432 | cgroup_release_and_wakeup_rmdir(&memcg->css); | 3300 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
3433 | } | 3301 | } |
3434 | 3302 | ||
3303 | /* | ||
3304 | * At page cache replacement, newpage is not under any memcg but it's on | ||
3305 | * LRU. So, this function doesn't touch res_counter but handles LRU | ||
3306 | * in correct way. Both pages are locked so we cannot race with uncharge. | ||
3307 | */ | ||
3308 | void mem_cgroup_replace_page_cache(struct page *oldpage, | ||
3309 | struct page *newpage) | ||
3310 | { | ||
3311 | struct mem_cgroup *memcg; | ||
3312 | struct page_cgroup *pc; | ||
3313 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3314 | |||
3315 | if (mem_cgroup_disabled()) | ||
3316 | return; | ||
3317 | |||
3318 | pc = lookup_page_cgroup(oldpage); | ||
3319 | /* fix accounting on old pages */ | ||
3320 | lock_page_cgroup(pc); | ||
3321 | memcg = pc->mem_cgroup; | ||
3322 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | ||
3323 | ClearPageCgroupUsed(pc); | ||
3324 | unlock_page_cgroup(pc); | ||
3325 | |||
3326 | if (PageSwapBacked(oldpage)) | ||
3327 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3328 | |||
3329 | /* | ||
3330 | * Even if newpage->mapping was NULL before starting replacement, | ||
3331 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | ||
3332 | * LRU while we overwrite pc->mem_cgroup. | ||
3333 | */ | ||
3334 | __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); | ||
3335 | } | ||
3336 | |||
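mem_cgroup_replace_page_cache() is a helper for the page-cache replacement path. Its caller is not part of this excerpt; presumably replace_page_cache_page() in mm/filemap.c invokes it while holding both page locks, roughly as sketched below. This is a hypothetical call site shown only to make the locking assumption in the comment above concrete.

	/* Hypothetical caller sketch: both pages locked, as the comment above
	 * requires, so the uncharge path cannot race with the hand-over. */
	VM_BUG_ON(!PageLocked(oldpage));
	VM_BUG_ON(!PageLocked(newpage));
	mem_cgroup_replace_page_cache(oldpage, newpage);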
3435 | #ifdef CONFIG_DEBUG_VM | 3337 | #ifdef CONFIG_DEBUG_VM |
3436 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3338 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3437 | { | 3339 | { |
3438 | struct page_cgroup *pc; | 3340 | struct page_cgroup *pc; |
3439 | 3341 | ||
3440 | pc = lookup_page_cgroup(page); | 3342 | pc = lookup_page_cgroup(page); |
3343 | /* | ||
3344 | * Can be NULL while feeding pages into the page allocator for | ||
3345 | * the first time, i.e. during boot or memory hotplug; | ||
3346 | * or when mem_cgroup_disabled(). | ||
3347 | */ | ||
3441 | if (likely(pc) && PageCgroupUsed(pc)) | 3348 | if (likely(pc) && PageCgroupUsed(pc)) |
3442 | return pc; | 3349 | return pc; |
3443 | return NULL; | 3350 | return NULL; |
@@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
3457 | 3364 | ||
3458 | pc = lookup_page_cgroup_used(page); | 3365 | pc = lookup_page_cgroup_used(page); |
3459 | if (pc) { | 3366 | if (pc) { |
3460 | int ret = -1; | 3367 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
3461 | char *path; | ||
3462 | |||
3463 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | ||
3464 | pc, pc->flags, pc->mem_cgroup); | 3368 | pc, pc->flags, pc->mem_cgroup); |
3465 | |||
3466 | path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3467 | if (path) { | ||
3468 | rcu_read_lock(); | ||
3469 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | ||
3470 | path, PATH_MAX); | ||
3471 | rcu_read_unlock(); | ||
3472 | } | ||
3473 | |||
3474 | printk(KERN_CONT "(%s)\n", | ||
3475 | (ret < 0) ? "cannot get the path" : path); | ||
3476 | kfree(path); | ||
3477 | } | 3369 | } |
3478 | } | 3370 | } |
3479 | #endif | 3371 | #endif |
@@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3534 | if (!ret) | 3426 | if (!ret) |
3535 | break; | 3427 | break; |
3536 | 3428 | ||
3537 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3429 | mem_cgroup_reclaim(memcg, GFP_KERNEL, |
3538 | MEM_CGROUP_RECLAIM_SHRINK, | 3430 | MEM_CGROUP_RECLAIM_SHRINK); |
3539 | NULL); | ||
3540 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3431 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3541 | /* Usage is reduced ? */ | 3432 | /* Usage is reduced ? */ |
3542 | if (curusage >= oldusage) | 3433 | if (curusage >= oldusage) |
@@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3594 | if (!ret) | 3485 | if (!ret) |
3595 | break; | 3486 | break; |
3596 | 3487 | ||
3597 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3488 | mem_cgroup_reclaim(memcg, GFP_KERNEL, |
3598 | MEM_CGROUP_RECLAIM_NOSWAP | | 3489 | MEM_CGROUP_RECLAIM_NOSWAP | |
3599 | MEM_CGROUP_RECLAIM_SHRINK, | 3490 | MEM_CGROUP_RECLAIM_SHRINK); |
3600 | NULL); | ||
3601 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3491 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3602 | /* Usage is reduced ? */ | 3492 | /* Usage is reduced ? */ |
3603 | if (curusage >= oldusage) | 3493 | if (curusage >= oldusage) |
@@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3640 | break; | 3530 | break; |
3641 | 3531 | ||
3642 | nr_scanned = 0; | 3532 | nr_scanned = 0; |
3643 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | 3533 | reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, |
3644 | gfp_mask, | 3534 | gfp_mask, &nr_scanned); |
3645 | MEM_CGROUP_RECLAIM_SOFT, | ||
3646 | &nr_scanned); | ||
3647 | nr_reclaimed += reclaimed; | 3535 | nr_reclaimed += reclaimed; |
3648 | *total_scanned += nr_scanned; | 3536 | *total_scanned += nr_scanned; |
3649 | spin_lock(&mctz->lock); | 3537 | spin_lock(&mctz->lock); |
@@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3711 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3599 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3712 | int node, int zid, enum lru_list lru) | 3600 | int node, int zid, enum lru_list lru) |
3713 | { | 3601 | { |
3714 | struct zone *zone; | ||
3715 | struct mem_cgroup_per_zone *mz; | 3602 | struct mem_cgroup_per_zone *mz; |
3716 | struct page_cgroup *pc, *busy; | ||
3717 | unsigned long flags, loop; | 3603 | unsigned long flags, loop; |
3718 | struct list_head *list; | 3604 | struct list_head *list; |
3605 | struct page *busy; | ||
3606 | struct zone *zone; | ||
3719 | int ret = 0; | 3607 | int ret = 0; |
3720 | 3608 | ||
3721 | zone = &NODE_DATA(node)->node_zones[zid]; | 3609 | zone = &NODE_DATA(node)->node_zones[zid]; |
3722 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3610 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3723 | list = &mz->lists[lru]; | 3611 | list = &mz->lruvec.lists[lru]; |
3724 | 3612 | ||
3725 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3613 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
3726 | /* give some margin against EBUSY etc...*/ | 3614 | /* give some margin against EBUSY etc...*/ |
3727 | loop += 256; | 3615 | loop += 256; |
3728 | busy = NULL; | 3616 | busy = NULL; |
3729 | while (loop--) { | 3617 | while (loop--) { |
3618 | struct page_cgroup *pc; | ||
3730 | struct page *page; | 3619 | struct page *page; |
3731 | 3620 | ||
3732 | ret = 0; | 3621 | ret = 0; |
@@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3735 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3624 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3736 | break; | 3625 | break; |
3737 | } | 3626 | } |
3738 | pc = list_entry(list->prev, struct page_cgroup, lru); | 3627 | page = list_entry(list->prev, struct page, lru); |
3739 | if (busy == pc) { | 3628 | if (busy == page) { |
3740 | list_move(&pc->lru, list); | 3629 | list_move(&page->lru, list); |
3741 | busy = NULL; | 3630 | busy = NULL; |
3742 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3631 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3743 | continue; | 3632 | continue; |
3744 | } | 3633 | } |
3745 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3634 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3746 | 3635 | ||
3747 | page = lookup_cgroup_page(pc); | 3636 | pc = lookup_page_cgroup(page); |
3748 | 3637 | ||
3749 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3638 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); |
3750 | if (ret == -ENOMEM) | 3639 | if (ret == -ENOMEM || ret == -EINTR) |
3751 | break; | 3640 | break; |
3752 | 3641 | ||
3753 | if (ret == -EBUSY || ret == -EINVAL) { | 3642 | if (ret == -EBUSY || ret == -EINVAL) { |
3754 | /* found lock contention or "pc" is obsolete. */ | 3643 | /* found lock contention or "pc" is obsolete. */ |
3755 | busy = pc; | 3644 | busy = page; |
3756 | cond_resched(); | 3645 | cond_resched(); |
3757 | } else | 3646 | } else |
3758 | busy = NULL; | 3647 | busy = NULL; |
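The force-empty loop above now walks struct page entries taken directly from mz->lruvec.lists[] instead of struct page_cgroup, and maps back to the page_cgroup with lookup_page_cgroup(). The lruvec type itself is defined outside this excerpt; judging by how it is indexed here and in the mm/page_alloc.c hunks below, it is presumably just a container for the per-LRU list heads:

	/* Assumed definition (not shown in this diff excerpt). */
	struct lruvec {
		struct list_head lists[NR_LRU_LISTS];
	};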
@@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4846 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4735 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4847 | mz = &pn->zoneinfo[zone]; | 4736 | mz = &pn->zoneinfo[zone]; |
4848 | for_each_lru(l) | 4737 | for_each_lru(l) |
4849 | INIT_LIST_HEAD(&mz->lists[l]); | 4738 | INIT_LIST_HEAD(&mz->lruvec.lists[l]); |
4850 | mz->usage_in_excess = 0; | 4739 | mz->usage_in_excess = 0; |
4851 | mz->on_tree = false; | 4740 | mz->on_tree = false; |
4852 | mz->mem = memcg; | 4741 | mz->mem = memcg; |
@@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4906 | mem_cgroup_remove_from_trees(memcg); | 4795 | mem_cgroup_remove_from_trees(memcg); |
4907 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 4796 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
4908 | 4797 | ||
4909 | for_each_node_state(node, N_POSSIBLE) | 4798 | for_each_node(node) |
4910 | free_mem_cgroup_per_zone_info(memcg, node); | 4799 | free_mem_cgroup_per_zone_info(memcg, node); |
4911 | 4800 | ||
4912 | free_percpu(memcg->stat); | 4801 | free_percpu(memcg->stat); |
@@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
4965 | struct mem_cgroup_tree_per_zone *rtpz; | 4854 | struct mem_cgroup_tree_per_zone *rtpz; |
4966 | int tmp, node, zone; | 4855 | int tmp, node, zone; |
4967 | 4856 | ||
4968 | for_each_node_state(node, N_POSSIBLE) { | 4857 | for_each_node(node) { |
4969 | tmp = node; | 4858 | tmp = node; |
4970 | if (!node_state(node, N_NORMAL_MEMORY)) | 4859 | if (!node_state(node, N_NORMAL_MEMORY)) |
4971 | tmp = -1; | 4860 | tmp = -1; |
4972 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 4861 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
4973 | if (!rtpn) | 4862 | if (!rtpn) |
4974 | return 1; | 4863 | goto err_cleanup; |
4975 | 4864 | ||
4976 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 4865 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
4977 | 4866 | ||
@@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
4982 | } | 4871 | } |
4983 | } | 4872 | } |
4984 | return 0; | 4873 | return 0; |
4874 | |||
4875 | err_cleanup: | ||
4876 | for_each_node(node) { | ||
4877 | if (!soft_limit_tree.rb_tree_per_node[node]) | ||
4878 | break; | ||
4879 | kfree(soft_limit_tree.rb_tree_per_node[node]); | ||
4880 | soft_limit_tree.rb_tree_per_node[node] = NULL; | ||
4881 | } | ||
4882 | return 1; | ||
4883 | |||
4985 | } | 4884 | } |
4986 | 4885 | ||
4987 | static struct cgroup_subsys_state * __ref | 4886 | static struct cgroup_subsys_state * __ref |
@@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4995 | if (!memcg) | 4894 | if (!memcg) |
4996 | return ERR_PTR(error); | 4895 | return ERR_PTR(error); |
4997 | 4896 | ||
4998 | for_each_node_state(node, N_POSSIBLE) | 4897 | for_each_node(node) |
4999 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) | 4898 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) |
5000 | goto free_out; | 4899 | goto free_out; |
5001 | 4900 | ||
@@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5033 | res_counter_init(&memcg->res, NULL); | 4932 | res_counter_init(&memcg->res, NULL); |
5034 | res_counter_init(&memcg->memsw, NULL); | 4933 | res_counter_init(&memcg->memsw, NULL); |
5035 | } | 4934 | } |
5036 | memcg->last_scanned_child = 0; | ||
5037 | memcg->last_scanned_node = MAX_NUMNODES; | 4935 | memcg->last_scanned_node = MAX_NUMNODES; |
5038 | INIT_LIST_HEAD(&memcg->oom_notify); | 4936 | INIT_LIST_HEAD(&memcg->oom_notify); |
5039 | 4937 | ||
@@ -5129,9 +5027,9 @@ one_by_one: | |||
5129 | } | 5027 | } |
5130 | ret = __mem_cgroup_try_charge(NULL, | 5028 | ret = __mem_cgroup_try_charge(NULL, |
5131 | GFP_KERNEL, 1, &memcg, false); | 5029 | GFP_KERNEL, 1, &memcg, false); |
5132 | if (ret || !memcg) | 5030 | if (ret) |
5133 | /* mem_cgroup_clear_mc() will do uncharge later */ | 5031 | /* mem_cgroup_clear_mc() will do uncharge later */ |
5134 | return -ENOMEM; | 5032 | return ret; |
5135 | mc.precharge++; | 5033 | mc.precharge++; |
5136 | } | 5034 | } |
5137 | return ret; | 5035 | return ret; |
@@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5276 | } | 5174 | } |
5277 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 5175 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
5278 | if (ent.val && !ret && | 5176 | if (ent.val && !ret && |
5279 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 5177 | css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { |
5280 | ret = MC_TARGET_SWAP; | 5178 | ret = MC_TARGET_SWAP; |
5281 | if (target) | 5179 | if (target) |
5282 | target->ent = ent; | 5180 | target->ent = ent; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..56080ea36140 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1557 | page_is_file_cache(page)); | 1557 | page_is_file_cache(page)); |
1558 | list_add(&page->lru, &pagelist); | 1558 | list_add(&page->lru, &pagelist); |
1559 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1559 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1560 | 0, true); | 1560 | 0, MIGRATE_SYNC); |
1561 | if (ret) { | 1561 | if (ret) { |
1562 | putback_lru_pages(&pagelist); | 1562 | putback_lru_pages(&pagelist); |
1563 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1563 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
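From here on, migrate_pages() callers pass an enum migrate_mode instead of the old bool sync. The enum definition is not included in this excerpt; based on the modes referenced in these hunks (MIGRATE_ASYNC, MIGRATE_SYNC, and the "sync-light" case mentioned in a later comment), it presumably lives in a header such as include/linux/migrate_mode.h and looks roughly like the following sketch:

	/* Assumed definition, inferred from the call sites in this series. */
	enum migrate_mode {
		MIGRATE_ASYNC,		/* never block; back off on contention */
		MIGRATE_SYNC_LIGHT,	/* may block, but avoid expensive stalls
					 * such as waiting on writeback */
		MIGRATE_SYNC,		/* full synchronous migration */
	};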
diff --git a/mm/memory.c b/mm/memory.c index 829d43735402..5e30583c2605 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
293 | { | 293 | { |
294 | struct mmu_gather_batch *batch; | 294 | struct mmu_gather_batch *batch; |
295 | 295 | ||
296 | tlb->need_flush = 1; | 296 | VM_BUG_ON(!tlb->need_flush); |
297 | 297 | ||
298 | if (tlb_fast_mode(tlb)) { | 298 | if (tlb_fast_mode(tlb)) { |
299 | free_page_and_swap_cache(page); | 299 | free_page_and_swap_cache(page); |
@@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1231 | if (next-addr != HPAGE_PMD_SIZE) { | 1231 | if (next-addr != HPAGE_PMD_SIZE) { |
1232 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1232 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1233 | split_huge_page_pmd(vma->vm_mm, pmd); | 1233 | split_huge_page_pmd(vma->vm_mm, pmd); |
1234 | } else if (zap_huge_pmd(tlb, vma, pmd)) | 1234 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1235 | continue; | 1235 | continue; |
1236 | /* fall through */ | 1236 | /* fall through */ |
1237 | } | 1237 | } |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2168489c0bc9..6629fafd6ce4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
809 | } | 809 | } |
810 | /* this function returns # of failed pages */ | 810 | /* this function returns # of failed pages */ |
811 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 811 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, |
812 | true, true); | 812 | true, MIGRATE_SYNC); |
813 | if (ret) | 813 | if (ret) |
814 | putback_lru_pages(&source); | 814 | putback_lru_pages(&source); |
815 | } | 815 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3d58f088466..06b145fb64ab 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
942 | 942 | ||
943 | if (!list_empty(&pagelist)) { | 943 | if (!list_empty(&pagelist)) { |
944 | err = migrate_pages(&pagelist, new_node_page, dest, | 944 | err = migrate_pages(&pagelist, new_node_page, dest, |
945 | false, true); | 945 | false, MIGRATE_SYNC); |
946 | if (err) | 946 | if (err) |
947 | putback_lru_pages(&pagelist); | 947 | putback_lru_pages(&pagelist); |
948 | } | 948 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 89ea0854332e..9871a56d82c3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -216,6 +216,56 @@ out: | |||
216 | pte_unmap_unlock(ptep, ptl); | 216 | pte_unmap_unlock(ptep, ptl); |
217 | } | 217 | } |
218 | 218 | ||
219 | #ifdef CONFIG_BLOCK | ||
220 | /* Returns true if all buffers are successfully locked */ | ||
221 | static bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
222 | enum migrate_mode mode) | ||
223 | { | ||
224 | struct buffer_head *bh = head; | ||
225 | |||
226 | /* Simple case, sync compaction */ | ||
227 | if (mode != MIGRATE_ASYNC) { | ||
228 | do { | ||
229 | get_bh(bh); | ||
230 | lock_buffer(bh); | ||
231 | bh = bh->b_this_page; | ||
232 | |||
233 | } while (bh != head); | ||
234 | |||
235 | return true; | ||
236 | } | ||
237 | |||
238 | /* async case, we cannot block on lock_buffer so use trylock_buffer */ | ||
239 | do { | ||
240 | get_bh(bh); | ||
241 | if (!trylock_buffer(bh)) { | ||
242 | /* | ||
243 | * We failed to lock the buffer and cannot stall in | ||
244 | * async migration. Release the taken locks | ||
245 | */ | ||
246 | struct buffer_head *failed_bh = bh; | ||
247 | put_bh(failed_bh); | ||
248 | bh = head; | ||
249 | while (bh != failed_bh) { | ||
250 | unlock_buffer(bh); | ||
251 | put_bh(bh); | ||
252 | bh = bh->b_this_page; | ||
253 | } | ||
254 | return false; | ||
255 | } | ||
256 | |||
257 | bh = bh->b_this_page; | ||
258 | } while (bh != head); | ||
259 | return true; | ||
260 | } | ||
261 | #else | ||
262 | static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
263 | enum migrate_mode mode) | ||
264 | { | ||
265 | return true; | ||
266 | } | ||
267 | #endif /* CONFIG_BLOCK */ | ||
268 | |||
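buffer_migrate_lock_buffers() implements an all-or-nothing trylock for the async case: if any buffer in the ring is already locked, every buffer taken so far is unlocked and released in order before returning false, so the caller can simply give up on this page and retry later. A userspace analogue of the same pattern, included purely as an illustration (it is not kernel code), is:

	#include <pthread.h>
	#include <stdbool.h>

	/* Try to take all locks; on any failure, roll back the ones already
	 * held and report failure so the caller can retry later. */
	static bool trylock_all(pthread_mutex_t *locks, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (pthread_mutex_trylock(&locks[i]) != 0) {
				while (--i >= 0)
					pthread_mutex_unlock(&locks[i]);
				return false;
			}
		}
		return true;
	}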
219 | /* | 269 | /* |
220 | * Replace the page in the mapping. | 270 | * Replace the page in the mapping. |
221 | * | 271 | * |
@@ -225,7 +275,8 @@ out: | |||
225 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. | 275 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
226 | */ | 276 | */ |
227 | static int migrate_page_move_mapping(struct address_space *mapping, | 277 | static int migrate_page_move_mapping(struct address_space *mapping, |
228 | struct page *newpage, struct page *page) | 278 | struct page *newpage, struct page *page, |
279 | struct buffer_head *head, enum migrate_mode mode) | ||
229 | { | 280 | { |
230 | int expected_count; | 281 | int expected_count; |
231 | void **pslot; | 282 | void **pslot; |
@@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
255 | } | 306 | } |
256 | 307 | ||
257 | /* | 308 | /* |
309 | * In the async migration case of moving a page with buffers, lock the | ||
310 | * buffers using trylock before the mapping is moved. If the mapping | ||
311 | * were moved first and we then failed to lock the buffers, we could | ||
312 | * not move the mapping back due to an elevated page count and would | ||
313 | * have to block waiting on other references to be dropped. | ||
314 | */ | ||
315 | if (mode == MIGRATE_ASYNC && head && | ||
316 | !buffer_migrate_lock_buffers(head, mode)) { | ||
317 | page_unfreeze_refs(page, expected_count); | ||
318 | spin_unlock_irq(&mapping->tree_lock); | ||
319 | return -EAGAIN; | ||
320 | } | ||
321 | |||
322 | /* | ||
258 | * Now we know that no one else is looking at the page. | 323 | * Now we know that no one else is looking at the page. |
259 | */ | 324 | */ |
260 | get_page(newpage); /* add cache reference */ | 325 | get_page(newpage); /* add cache reference */ |
@@ -409,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page); | |||
409 | * Pages are locked upon entry and exit. | 474 | * Pages are locked upon entry and exit. |
410 | */ | 475 | */ |
411 | int migrate_page(struct address_space *mapping, | 476 | int migrate_page(struct address_space *mapping, |
412 | struct page *newpage, struct page *page) | 477 | struct page *newpage, struct page *page, |
478 | enum migrate_mode mode) | ||
413 | { | 479 | { |
414 | int rc; | 480 | int rc; |
415 | 481 | ||
416 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 482 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
417 | 483 | ||
418 | rc = migrate_page_move_mapping(mapping, newpage, page); | 484 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
419 | 485 | ||
420 | if (rc) | 486 | if (rc) |
421 | return rc; | 487 | return rc; |
@@ -432,28 +498,28 @@ EXPORT_SYMBOL(migrate_page); | |||
432 | * exist. | 498 | * exist. |
433 | */ | 499 | */ |
434 | int buffer_migrate_page(struct address_space *mapping, | 500 | int buffer_migrate_page(struct address_space *mapping, |
435 | struct page *newpage, struct page *page) | 501 | struct page *newpage, struct page *page, enum migrate_mode mode) |
436 | { | 502 | { |
437 | struct buffer_head *bh, *head; | 503 | struct buffer_head *bh, *head; |
438 | int rc; | 504 | int rc; |
439 | 505 | ||
440 | if (!page_has_buffers(page)) | 506 | if (!page_has_buffers(page)) |
441 | return migrate_page(mapping, newpage, page); | 507 | return migrate_page(mapping, newpage, page, mode); |
442 | 508 | ||
443 | head = page_buffers(page); | 509 | head = page_buffers(page); |
444 | 510 | ||
445 | rc = migrate_page_move_mapping(mapping, newpage, page); | 511 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
446 | 512 | ||
447 | if (rc) | 513 | if (rc) |
448 | return rc; | 514 | return rc; |
449 | 515 | ||
450 | bh = head; | 516 | /* |
451 | do { | 517 | * In the async case, migrate_page_move_mapping locked the buffers |
452 | get_bh(bh); | 518 | * with an IRQ-safe spinlock held. In the sync case, the buffers |
453 | lock_buffer(bh); | 519 | * need to be locked now |
454 | bh = bh->b_this_page; | 520 | */ |
455 | 521 | if (mode != MIGRATE_ASYNC) | |
456 | } while (bh != head); | 522 | BUG_ON(!buffer_migrate_lock_buffers(head, mode)); |
457 | 523 | ||
458 | ClearPagePrivate(page); | 524 | ClearPagePrivate(page); |
459 | set_page_private(newpage, page_private(page)); | 525 | set_page_private(newpage, page_private(page)); |
@@ -530,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page) | |||
530 | * Default handling if a filesystem does not provide a migration function. | 596 | * Default handling if a filesystem does not provide a migration function. |
531 | */ | 597 | */ |
532 | static int fallback_migrate_page(struct address_space *mapping, | 598 | static int fallback_migrate_page(struct address_space *mapping, |
533 | struct page *newpage, struct page *page) | 599 | struct page *newpage, struct page *page, enum migrate_mode mode) |
534 | { | 600 | { |
535 | if (PageDirty(page)) | 601 | if (PageDirty(page)) { |
602 | /* Only writeback pages in full synchronous migration */ | ||
603 | if (mode != MIGRATE_SYNC) | ||
604 | return -EBUSY; | ||
536 | return writeout(mapping, page); | 605 | return writeout(mapping, page); |
606 | } | ||
537 | 607 | ||
538 | /* | 608 | /* |
539 | * Buffers may be managed in a filesystem specific way. | 609 | * Buffers may be managed in a filesystem specific way. |
@@ -543,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
543 | !try_to_release_page(page, GFP_KERNEL)) | 613 | !try_to_release_page(page, GFP_KERNEL)) |
544 | return -EAGAIN; | 614 | return -EAGAIN; |
545 | 615 | ||
546 | return migrate_page(mapping, newpage, page); | 616 | return migrate_page(mapping, newpage, page, mode); |
547 | } | 617 | } |
548 | 618 | ||
549 | /* | 619 | /* |
@@ -558,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
558 | * == 0 - success | 628 | * == 0 - success |
559 | */ | 629 | */ |
560 | static int move_to_new_page(struct page *newpage, struct page *page, | 630 | static int move_to_new_page(struct page *newpage, struct page *page, |
561 | int remap_swapcache, bool sync) | 631 | int remap_swapcache, enum migrate_mode mode) |
562 | { | 632 | { |
563 | struct address_space *mapping; | 633 | struct address_space *mapping; |
564 | int rc; | 634 | int rc; |
@@ -579,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
579 | 649 | ||
580 | mapping = page_mapping(page); | 650 | mapping = page_mapping(page); |
581 | if (!mapping) | 651 | if (!mapping) |
582 | rc = migrate_page(mapping, newpage, page); | 652 | rc = migrate_page(mapping, newpage, page, mode); |
583 | else { | 653 | else if (mapping->a_ops->migratepage) |
584 | /* | 654 | /* |
585 | * Do not writeback pages if !sync and migratepage is | 655 | * Most pages have a mapping and most filesystems provide a |
586 | * not pointing to migrate_page() which is nonblocking | 656 | * migratepage callback. Anonymous pages are part of swap |
587 | * (swapcache/tmpfs uses migratepage = migrate_page). | 657 | * space which also has its own migratepage callback. This |
658 | * is the most common path for page migration. | ||
588 | */ | 659 | */ |
589 | if (PageDirty(page) && !sync && | 660 | rc = mapping->a_ops->migratepage(mapping, |
590 | mapping->a_ops->migratepage != migrate_page) | 661 | newpage, page, mode); |
591 | rc = -EBUSY; | 662 | else |
592 | else if (mapping->a_ops->migratepage) | 663 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
593 | /* | ||
594 | * Most pages have a mapping and most filesystems | ||
595 | * should provide a migration function. Anonymous | ||
596 | * pages are part of swap space which also has its | ||
597 | * own migration function. This is the most common | ||
598 | * path for page migration. | ||
599 | */ | ||
600 | rc = mapping->a_ops->migratepage(mapping, | ||
601 | newpage, page); | ||
602 | else | ||
603 | rc = fallback_migrate_page(mapping, newpage, page); | ||
604 | } | ||
605 | 664 | ||
606 | if (rc) { | 665 | if (rc) { |
607 | newpage->mapping = NULL; | 666 | newpage->mapping = NULL; |
@@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
616 | } | 675 | } |
617 | 676 | ||
618 | static int __unmap_and_move(struct page *page, struct page *newpage, | 677 | static int __unmap_and_move(struct page *page, struct page *newpage, |
619 | int force, bool offlining, bool sync) | 678 | int force, bool offlining, enum migrate_mode mode) |
620 | { | 679 | { |
621 | int rc = -EAGAIN; | 680 | int rc = -EAGAIN; |
622 | int remap_swapcache = 1; | 681 | int remap_swapcache = 1; |
@@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
625 | struct anon_vma *anon_vma = NULL; | 684 | struct anon_vma *anon_vma = NULL; |
626 | 685 | ||
627 | if (!trylock_page(page)) { | 686 | if (!trylock_page(page)) { |
628 | if (!force || !sync) | 687 | if (!force || mode == MIGRATE_ASYNC) |
629 | goto out; | 688 | goto out; |
630 | 689 | ||
631 | /* | 690 | /* |
@@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
671 | 730 | ||
672 | if (PageWriteback(page)) { | 731 | if (PageWriteback(page)) { |
673 | /* | 732 | /* |
674 | * For !sync, there is no point retrying as the retry loop | 733 | * Only in the case of a full synchronous migration is it |
675 | * is expected to be too short for PageWriteback to be cleared | 734 | * necessary to wait for PageWriteback. In the async case, |
735 | * the retry loop is too short and in the sync-light case, | ||
736 | * the overhead of stalling is too much | ||
676 | */ | 737 | */ |
677 | if (!sync) { | 738 | if (mode != MIGRATE_SYNC) { |
678 | rc = -EBUSY; | 739 | rc = -EBUSY; |
679 | goto uncharge; | 740 | goto uncharge; |
680 | } | 741 | } |
@@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
745 | 806 | ||
746 | skip_unmap: | 807 | skip_unmap: |
747 | if (!page_mapped(page)) | 808 | if (!page_mapped(page)) |
748 | rc = move_to_new_page(newpage, page, remap_swapcache, sync); | 809 | rc = move_to_new_page(newpage, page, remap_swapcache, mode); |
749 | 810 | ||
750 | if (rc && remap_swapcache) | 811 | if (rc && remap_swapcache) |
751 | remove_migration_ptes(page, page); | 812 | remove_migration_ptes(page, page); |
@@ -768,7 +829,8 @@ out: | |||
768 | * to the newly allocated page in newpage. | 829 | * to the newly allocated page in newpage. |
769 | */ | 830 | */ |
770 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 831 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
771 | struct page *page, int force, bool offlining, bool sync) | 832 | struct page *page, int force, bool offlining, |
833 | enum migrate_mode mode) | ||
772 | { | 834 | { |
773 | int rc = 0; | 835 | int rc = 0; |
774 | int *result = NULL; | 836 | int *result = NULL; |
@@ -777,6 +839,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
777 | if (!newpage) | 839 | if (!newpage) |
778 | return -ENOMEM; | 840 | return -ENOMEM; |
779 | 841 | ||
842 | mem_cgroup_reset_owner(newpage); | ||
843 | |||
780 | if (page_count(page) == 1) { | 844 | if (page_count(page) == 1) { |
781 | /* page was freed from under us. So we are done. */ | 845 | /* page was freed from under us. So we are done. */ |
782 | goto out; | 846 | goto out; |
@@ -786,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
786 | if (unlikely(split_huge_page(page))) | 850 | if (unlikely(split_huge_page(page))) |
787 | goto out; | 851 | goto out; |
788 | 852 | ||
789 | rc = __unmap_and_move(page, newpage, force, offlining, sync); | 853 | rc = __unmap_and_move(page, newpage, force, offlining, mode); |
790 | out: | 854 | out: |
791 | if (rc != -EAGAIN) { | 855 | if (rc != -EAGAIN) { |
792 | /* | 856 | /* |
@@ -834,7 +898,8 @@ out: | |||
834 | */ | 898 | */ |
835 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 899 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
836 | unsigned long private, struct page *hpage, | 900 | unsigned long private, struct page *hpage, |
837 | int force, bool offlining, bool sync) | 901 | int force, bool offlining, |
902 | enum migrate_mode mode) | ||
838 | { | 903 | { |
839 | int rc = 0; | 904 | int rc = 0; |
840 | int *result = NULL; | 905 | int *result = NULL; |
@@ -847,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
847 | rc = -EAGAIN; | 912 | rc = -EAGAIN; |
848 | 913 | ||
849 | if (!trylock_page(hpage)) { | 914 | if (!trylock_page(hpage)) { |
850 | if (!force || !sync) | 915 | if (!force || mode != MIGRATE_SYNC) |
851 | goto out; | 916 | goto out; |
852 | lock_page(hpage); | 917 | lock_page(hpage); |
853 | } | 918 | } |
@@ -858,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
858 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 923 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
859 | 924 | ||
860 | if (!page_mapped(hpage)) | 925 | if (!page_mapped(hpage)) |
861 | rc = move_to_new_page(new_hpage, hpage, 1, sync); | 926 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
862 | 927 | ||
863 | if (rc) | 928 | if (rc) |
864 | remove_migration_ptes(hpage, hpage); | 929 | remove_migration_ptes(hpage, hpage); |
@@ -901,7 +966,7 @@ out: | |||
901 | */ | 966 | */ |
902 | int migrate_pages(struct list_head *from, | 967 | int migrate_pages(struct list_head *from, |
903 | new_page_t get_new_page, unsigned long private, bool offlining, | 968 | new_page_t get_new_page, unsigned long private, bool offlining, |
904 | bool sync) | 969 | enum migrate_mode mode) |
905 | { | 970 | { |
906 | int retry = 1; | 971 | int retry = 1; |
907 | int nr_failed = 0; | 972 | int nr_failed = 0; |
@@ -922,7 +987,7 @@ int migrate_pages(struct list_head *from, | |||
922 | 987 | ||
923 | rc = unmap_and_move(get_new_page, private, | 988 | rc = unmap_and_move(get_new_page, private, |
924 | page, pass > 2, offlining, | 989 | page, pass > 2, offlining, |
925 | sync); | 990 | mode); |
926 | 991 | ||
927 | switch(rc) { | 992 | switch(rc) { |
928 | case -ENOMEM: | 993 | case -ENOMEM: |
@@ -952,7 +1017,7 @@ out: | |||
952 | 1017 | ||
953 | int migrate_huge_pages(struct list_head *from, | 1018 | int migrate_huge_pages(struct list_head *from, |
954 | new_page_t get_new_page, unsigned long private, bool offlining, | 1019 | new_page_t get_new_page, unsigned long private, bool offlining, |
955 | bool sync) | 1020 | enum migrate_mode mode) |
956 | { | 1021 | { |
957 | int retry = 1; | 1022 | int retry = 1; |
958 | int nr_failed = 0; | 1023 | int nr_failed = 0; |
@@ -969,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from, | |||
969 | 1034 | ||
970 | rc = unmap_and_move_huge_page(get_new_page, | 1035 | rc = unmap_and_move_huge_page(get_new_page, |
971 | private, page, pass > 2, offlining, | 1036 | private, page, pass > 2, offlining, |
972 | sync); | 1037 | mode); |
973 | 1038 | ||
974 | switch(rc) { | 1039 | switch(rc) { |
975 | case -ENOMEM: | 1040 | case -ENOMEM: |
@@ -1098,7 +1163,7 @@ set_status: | |||
1098 | err = 0; | 1163 | err = 0; |
1099 | if (!list_empty(&pagelist)) { | 1164 | if (!list_empty(&pagelist)) { |
1100 | err = migrate_pages(&pagelist, new_page_node, | 1165 | err = migrate_pages(&pagelist, new_page_node, |
1101 | (unsigned long)pm, 0, true); | 1166 | (unsigned long)pm, 0, MIGRATE_SYNC); |
1102 | if (err) | 1167 | if (err) |
1103 | putback_lru_pages(&pagelist); | 1168 | putback_lru_pages(&pagelist); |
1104 | } | 1169 | } |
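With the conversion complete, every migrate_pages()/migrate_huge_pages() caller in this diff now passes a migrate_mode. The corresponding prototypes in include/linux/migrate.h are outside this excerpt, but from the definitions in mm/migrate.c above they presumably read:

	/* Assumed prototypes, mirroring the definitions in mm/migrate.c above. */
	extern int migrate_pages(struct list_head *l, new_page_t x,
			unsigned long private, bool offlining,
			enum migrate_mode mode);
	extern int migrate_huge_pages(struct list_head *l, new_page_t x,
			unsigned long private, bool offlining,
			enum migrate_mode mode);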
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c122faa05c5..2958fd8e7c9a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) | |||
152 | 152 | ||
153 | /* return true if the task is not adequate as candidate victim task. */ | 153 | /* return true if the task is not adequate as candidate victim task. */ |
154 | static bool oom_unkillable_task(struct task_struct *p, | 154 | static bool oom_unkillable_task(struct task_struct *p, |
155 | const struct mem_cgroup *mem, const nodemask_t *nodemask) | 155 | const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
156 | { | 156 | { |
157 | if (is_global_init(p)) | 157 | if (is_global_init(p)) |
158 | return true; | 158 | return true; |
@@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
160 | return true; | 160 | return true; |
161 | 161 | ||
162 | /* When mem_cgroup_out_of_memory() and p is not member of the group */ | 162 | /* When mem_cgroup_out_of_memory() and p is not member of the group */ |
163 | if (mem && !task_in_mem_cgroup(p, mem)) | 163 | if (memcg && !task_in_mem_cgroup(p, memcg)) |
164 | return true; | 164 | return true; |
165 | 165 | ||
166 | /* p may not have freeable memory in nodemask */ | 166 | /* p may not have freeable memory in nodemask */ |
@@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
179 | * predictable as possible. The goal is to return the highest value for the | 179 | * predictable as possible. The goal is to return the highest value for the |
180 | * task consuming the most memory to avoid subsequent oom failures. | 180 | * task consuming the most memory to avoid subsequent oom failures. |
181 | */ | 181 | */ |
182 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | 182 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
183 | const nodemask_t *nodemask, unsigned long totalpages) | 183 | const nodemask_t *nodemask, unsigned long totalpages) |
184 | { | 184 | { |
185 | long points; | 185 | long points; |
186 | 186 | ||
187 | if (oom_unkillable_task(p, mem, nodemask)) | 187 | if (oom_unkillable_task(p, memcg, nodemask)) |
188 | return 0; | 188 | return 0; |
189 | 189 | ||
190 | p = find_lock_task_mm(p); | 190 | p = find_lock_task_mm(p); |
@@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
308 | * (not docbooked, we don't want this one cluttering up the manual) | 308 | * (not docbooked, we don't want this one cluttering up the manual) |
309 | */ | 309 | */ |
310 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 310 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
311 | unsigned long totalpages, struct mem_cgroup *mem, | 311 | unsigned long totalpages, struct mem_cgroup *memcg, |
312 | const nodemask_t *nodemask) | 312 | const nodemask_t *nodemask) |
313 | { | 313 | { |
314 | struct task_struct *g, *p; | 314 | struct task_struct *g, *p; |
@@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
320 | 320 | ||
321 | if (p->exit_state) | 321 | if (p->exit_state) |
322 | continue; | 322 | continue; |
323 | if (oom_unkillable_task(p, mem, nodemask)) | 323 | if (oom_unkillable_task(p, memcg, nodemask)) |
324 | continue; | 324 | continue; |
325 | 325 | ||
326 | /* | 326 | /* |
@@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
364 | } | 364 | } |
365 | } | 365 | } |
366 | 366 | ||
367 | points = oom_badness(p, mem, nodemask, totalpages); | 367 | points = oom_badness(p, memcg, nodemask, totalpages); |
368 | if (points > *ppoints) { | 368 | if (points > *ppoints) { |
369 | chosen = p; | 369 | chosen = p; |
370 | *ppoints = points; | 370 | *ppoints = points; |
@@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
387 | * | 387 | * |
388 | * Call with tasklist_lock read-locked. | 388 | * Call with tasklist_lock read-locked. |
389 | */ | 389 | */ |
390 | static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) | 390 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
391 | { | 391 | { |
392 | struct task_struct *p; | 392 | struct task_struct *p; |
393 | struct task_struct *task; | 393 | struct task_struct *task; |
394 | 394 | ||
395 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 395 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); |
396 | for_each_process(p) { | 396 | for_each_process(p) { |
397 | if (oom_unkillable_task(p, mem, nodemask)) | 397 | if (oom_unkillable_task(p, memcg, nodemask)) |
398 | continue; | 398 | continue; |
399 | 399 | ||
400 | task = find_lock_task_mm(p); | 400 | task = find_lock_task_mm(p); |
@@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) | |||
417 | } | 417 | } |
418 | 418 | ||
419 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 419 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
420 | struct mem_cgroup *mem, const nodemask_t *nodemask) | 420 | struct mem_cgroup *memcg, const nodemask_t *nodemask) |
421 | { | 421 | { |
422 | task_lock(current); | 422 | task_lock(current); |
423 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 423 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
@@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
427 | cpuset_print_task_mems_allowed(current); | 427 | cpuset_print_task_mems_allowed(current); |
428 | task_unlock(current); | 428 | task_unlock(current); |
429 | dump_stack(); | 429 | dump_stack(); |
430 | mem_cgroup_print_oom_info(mem, p); | 430 | mem_cgroup_print_oom_info(memcg, p); |
431 | show_mem(SHOW_MEM_FILTER_NODES); | 431 | show_mem(SHOW_MEM_FILTER_NODES); |
432 | if (sysctl_oom_dump_tasks) | 432 | if (sysctl_oom_dump_tasks) |
433 | dump_tasks(mem, nodemask); | 433 | dump_tasks(memcg, nodemask); |
434 | } | 434 | } |
435 | 435 | ||
436 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 436 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
437 | static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | 437 | static int oom_kill_task(struct task_struct *p) |
438 | { | 438 | { |
439 | struct task_struct *q; | 439 | struct task_struct *q; |
440 | struct mm_struct *mm; | 440 | struct mm_struct *mm; |
@@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
484 | 484 | ||
485 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 485 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
486 | unsigned int points, unsigned long totalpages, | 486 | unsigned int points, unsigned long totalpages, |
487 | struct mem_cgroup *mem, nodemask_t *nodemask, | 487 | struct mem_cgroup *memcg, nodemask_t *nodemask, |
488 | const char *message) | 488 | const char *message) |
489 | { | 489 | { |
490 | struct task_struct *victim = p; | 490 | struct task_struct *victim = p; |
@@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
493 | unsigned int victim_points = 0; | 493 | unsigned int victim_points = 0; |
494 | 494 | ||
495 | if (printk_ratelimit()) | 495 | if (printk_ratelimit()) |
496 | dump_header(p, gfp_mask, order, mem, nodemask); | 496 | dump_header(p, gfp_mask, order, memcg, nodemask); |
497 | 497 | ||
498 | /* | 498 | /* |
499 | * If the task is already exiting, don't alarm the sysadmin or kill | 499 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
524 | /* | 524 | /* |
525 | * oom_badness() returns 0 if the thread is unkillable | 525 | * oom_badness() returns 0 if the thread is unkillable |
526 | */ | 526 | */ |
527 | child_points = oom_badness(child, mem, nodemask, | 527 | child_points = oom_badness(child, memcg, nodemask, |
528 | totalpages); | 528 | totalpages); |
529 | if (child_points > victim_points) { | 529 | if (child_points > victim_points) { |
530 | victim = child; | 530 | victim = child; |
@@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
533 | } | 533 | } |
534 | } while_each_thread(p, t); | 534 | } while_each_thread(p, t); |
535 | 535 | ||
536 | return oom_kill_task(victim, mem); | 536 | return oom_kill_task(victim); |
537 | } | 537 | } |
538 | 538 | ||
539 | /* | 539 | /* |
@@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
561 | } | 561 | } |
562 | 562 | ||
563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
564 | void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | 564 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) |
565 | { | 565 | { |
566 | unsigned long limit; | 566 | unsigned long limit; |
567 | unsigned int points = 0; | 567 | unsigned int points = 0; |
@@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
578 | } | 578 | } |
579 | 579 | ||
580 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 580 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); |
581 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; | 581 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; |
582 | read_lock(&tasklist_lock); | 582 | read_lock(&tasklist_lock); |
583 | retry: | 583 | retry: |
584 | p = select_bad_process(&points, limit, mem, NULL); | 584 | p = select_bad_process(&points, limit, memcg, NULL); |
585 | if (!p || PTR_ERR(p) == -1UL) | 585 | if (!p || PTR_ERR(p) == -1UL) |
586 | goto out; | 586 | goto out; |
587 | 587 | ||
588 | if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, | 588 | if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, |
589 | "Memory cgroup out of memory")) | 589 | "Memory cgroup out of memory")) |
590 | goto retry; | 590 | goto retry; |
591 | out: | 591 | out: |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 794e6715c226..0027d8f4a1bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1981,14 +1981,20 @@ static struct page * | |||
1981 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1981 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1982 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1982 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1983 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1983 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1984 | int migratetype, unsigned long *did_some_progress, | 1984 | int migratetype, bool sync_migration, |
1985 | bool sync_migration) | 1985 | bool *deferred_compaction, |
1986 | unsigned long *did_some_progress) | ||
1986 | { | 1987 | { |
1987 | struct page *page; | 1988 | struct page *page; |
1988 | 1989 | ||
1989 | if (!order || compaction_deferred(preferred_zone)) | 1990 | if (!order) |
1990 | return NULL; | 1991 | return NULL; |
1991 | 1992 | ||
1993 | if (compaction_deferred(preferred_zone)) { | ||
1994 | *deferred_compaction = true; | ||
1995 | return NULL; | ||
1996 | } | ||
1997 | |||
1992 | current->flags |= PF_MEMALLOC; | 1998 | current->flags |= PF_MEMALLOC; |
1993 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1999 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1994 | nodemask, sync_migration); | 2000 | nodemask, sync_migration); |
@@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2016 | * but not enough to satisfy watermarks. | 2022 | * but not enough to satisfy watermarks. |
2017 | */ | 2023 | */ |
2018 | count_vm_event(COMPACTFAIL); | 2024 | count_vm_event(COMPACTFAIL); |
2019 | defer_compaction(preferred_zone); | 2025 | |
2026 | /* | ||
2027 | * As async compaction considers a subset of pageblocks, only | ||
2028 | * defer if the failure was a sync compaction failure. | ||
2029 | */ | ||
2030 | if (sync_migration) | ||
2031 | defer_compaction(preferred_zone); | ||
2020 | 2032 | ||
2021 | cond_resched(); | 2033 | cond_resched(); |
2022 | } | 2034 | } |
@@ -2028,8 +2040,9 @@ static inline struct page * | |||
2028 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2040 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2029 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2041 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2030 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2042 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2031 | int migratetype, unsigned long *did_some_progress, | 2043 | int migratetype, bool sync_migration, |
2032 | bool sync_migration) | 2044 | bool *deferred_compaction, |
2045 | unsigned long *did_some_progress) | ||
2033 | { | 2046 | { |
2034 | return NULL; | 2047 | return NULL; |
2035 | } | 2048 | } |
@@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2179 | unsigned long pages_reclaimed = 0; | 2192 | unsigned long pages_reclaimed = 0; |
2180 | unsigned long did_some_progress; | 2193 | unsigned long did_some_progress; |
2181 | bool sync_migration = false; | 2194 | bool sync_migration = false; |
2195 | bool deferred_compaction = false; | ||
2182 | 2196 | ||
2183 | /* | 2197 | /* |
2184 | * In the slowpath, we sanity check order to avoid ever trying to | 2198 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2259,12 +2273,22 @@ rebalance: | |||
2259 | zonelist, high_zoneidx, | 2273 | zonelist, high_zoneidx, |
2260 | nodemask, | 2274 | nodemask, |
2261 | alloc_flags, preferred_zone, | 2275 | alloc_flags, preferred_zone, |
2262 | migratetype, &did_some_progress, | 2276 | migratetype, sync_migration, |
2263 | sync_migration); | 2277 | &deferred_compaction, |
2278 | &did_some_progress); | ||
2264 | if (page) | 2279 | if (page) |
2265 | goto got_pg; | 2280 | goto got_pg; |
2266 | sync_migration = true; | 2281 | sync_migration = true; |
2267 | 2282 | ||
2283 | /* | ||
2284 | * If compaction is deferred for high-order allocations, it is because | ||
2285 | * sync compaction recently failed. If this is the case and the caller | ||
2286 | * has requested the system not be heavily disrupted, fail the | ||
2287 | * allocation now instead of entering direct reclaim | ||
2288 | */ | ||
2289 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | ||
2290 | goto nopage; | ||
2291 | |||
2268 | /* Try direct reclaim and then allocating */ | 2292 | /* Try direct reclaim and then allocating */ |
2269 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2293 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2270 | zonelist, high_zoneidx, | 2294 | zonelist, high_zoneidx, |
@@ -2328,8 +2352,9 @@ rebalance: | |||
2328 | zonelist, high_zoneidx, | 2352 | zonelist, high_zoneidx, |
2329 | nodemask, | 2353 | nodemask, |
2330 | alloc_flags, preferred_zone, | 2354 | alloc_flags, preferred_zone, |
2331 | migratetype, &did_some_progress, | 2355 | migratetype, sync_migration, |
2332 | sync_migration); | 2356 | &deferred_compaction, |
2357 | &did_some_progress); | ||
2333 | if (page) | 2358 | if (page) |
2334 | goto got_pg; | 2359 | goto got_pg; |
2335 | } | 2360 | } |
@@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4237 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4262 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4238 | struct zone *zone = pgdat->node_zones + j; | 4263 | struct zone *zone = pgdat->node_zones + j; |
4239 | unsigned long size, realsize, memmap_pages; | 4264 | unsigned long size, realsize, memmap_pages; |
4240 | enum lru_list l; | 4265 | enum lru_list lru; |
4241 | 4266 | ||
4242 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4267 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4243 | realsize = size - zone_absent_pages_in_node(nid, j, | 4268 | realsize = size - zone_absent_pages_in_node(nid, j, |
@@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4287 | zone->zone_pgdat = pgdat; | 4312 | zone->zone_pgdat = pgdat; |
4288 | 4313 | ||
4289 | zone_pcp_init(zone); | 4314 | zone_pcp_init(zone); |
4290 | for_each_lru(l) | 4315 | for_each_lru(lru) |
4291 | INIT_LIST_HEAD(&zone->lru[l].list); | 4316 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); |
4292 | zone->reclaim_stat.recent_rotated[0] = 0; | 4317 | zone->reclaim_stat.recent_rotated[0] = 0; |
4293 | zone->reclaim_stat.recent_rotated[1] = 0; | 4318 | zone->reclaim_stat.recent_rotated[1] = 0; |
4294 | zone->reclaim_stat.recent_scanned[0] = 0; | 4319 | zone->reclaim_stat.recent_scanned[0] = 0; |
@@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) | |||
4642 | 4667 | ||
4643 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4668 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { |
4644 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4669 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4645 | if (zone->present_pages) | 4670 | if (zone->present_pages) { |
4646 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4671 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
4672 | break; | ||
4673 | } | ||
4647 | } | 4674 | } |
4648 | #endif | 4675 | #endif |
4649 | } | 4676 | } |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2d123f94a8df..de1616aa9b1e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -11,13 +11,6 @@ | |||
11 | #include <linux/swapops.h> | 11 | #include <linux/swapops.h> |
12 | #include <linux/kmemleak.h> | 12 | #include <linux/kmemleak.h> |
13 | 13 | ||
14 | static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) | ||
15 | { | ||
16 | pc->flags = 0; | ||
17 | set_page_cgroup_array_id(pc, id); | ||
18 | pc->mem_cgroup = NULL; | ||
19 | INIT_LIST_HEAD(&pc->lru); | ||
20 | } | ||
21 | static unsigned long total_usage; | 14 | static unsigned long total_usage; |
22 | 15 | ||
23 | #if !defined(CONFIG_SPARSEMEM) | 16 | #if !defined(CONFIG_SPARSEMEM) |
@@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
35 | struct page_cgroup *base; | 28 | struct page_cgroup *base; |
36 | 29 | ||
37 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | 30 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; |
31 | #ifdef CONFIG_DEBUG_VM | ||
32 | /* | ||
33 | * The sanity checks the page allocator does upon freeing a | ||
34 | * page can reach here before the page_cgroup arrays are | ||
35 | * allocated when feeding a range of pages to the allocator | ||
36 | * for the first time during bootup or memory hotplug. | ||
37 | */ | ||
38 | if (unlikely(!base)) | 38 | if (unlikely(!base)) |
39 | return NULL; | 39 | return NULL; |
40 | 40 | #endif | |
41 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | 41 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; |
42 | return base + offset; | 42 | return base + offset; |
43 | } | 43 | } |
44 | 44 | ||
45 | struct page *lookup_cgroup_page(struct page_cgroup *pc) | ||
46 | { | ||
47 | unsigned long pfn; | ||
48 | struct page *page; | ||
49 | pg_data_t *pgdat; | ||
50 | |||
51 | pgdat = NODE_DATA(page_cgroup_array_id(pc)); | ||
52 | pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; | ||
53 | page = pfn_to_page(pfn); | ||
54 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
55 | return page; | ||
56 | } | ||
57 | |||
58 | static int __init alloc_node_page_cgroup(int nid) | 45 | static int __init alloc_node_page_cgroup(int nid) |
59 | { | 46 | { |
60 | struct page_cgroup *base, *pc; | 47 | struct page_cgroup *base; |
61 | unsigned long table_size; | 48 | unsigned long table_size; |
62 | unsigned long start_pfn, nr_pages, index; | 49 | unsigned long nr_pages; |
63 | 50 | ||
64 | start_pfn = NODE_DATA(nid)->node_start_pfn; | ||
65 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | 51 | nr_pages = NODE_DATA(nid)->node_spanned_pages; |
66 | |||
67 | if (!nr_pages) | 52 | if (!nr_pages) |
68 | return 0; | 53 | return 0; |
69 | 54 | ||
@@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid) | |||
73 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); |
74 | if (!base) | 59 | if (!base) |
75 | return -ENOMEM; | 60 | return -ENOMEM; |
76 | for (index = 0; index < nr_pages; index++) { | ||
77 | pc = base + index; | ||
78 | init_page_cgroup(pc, nid); | ||
79 | } | ||
80 | NODE_DATA(nid)->node_page_cgroup = base; | 61 | NODE_DATA(nid)->node_page_cgroup = base; |
81 | total_usage += table_size; | 62 | total_usage += table_size; |
82 | return 0; | 63 | return 0; |
@@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
111 | { | 92 | { |
112 | unsigned long pfn = page_to_pfn(page); | 93 | unsigned long pfn = page_to_pfn(page); |
113 | struct mem_section *section = __pfn_to_section(pfn); | 94 | struct mem_section *section = __pfn_to_section(pfn); |
114 | 95 | #ifdef CONFIG_DEBUG_VM | |
96 | /* | ||
97 | * The sanity checks the page allocator does upon freeing a | ||
98 | * page can reach here before the page_cgroup arrays are | ||
99 | * allocated when feeding a range of pages to the allocator | ||
100 | * for the first time during bootup or memory hotplug. | ||
101 | */ | ||
115 | if (!section->page_cgroup) | 102 | if (!section->page_cgroup) |
116 | return NULL; | 103 | return NULL; |
104 | #endif | ||
117 | return section->page_cgroup + pfn; | 105 | return section->page_cgroup + pfn; |
118 | } | 106 | } |
119 | 107 | ||
120 | struct page *lookup_cgroup_page(struct page_cgroup *pc) | ||
121 | { | ||
122 | struct mem_section *section; | ||
123 | struct page *page; | ||
124 | unsigned long nr; | ||
125 | |||
126 | nr = page_cgroup_array_id(pc); | ||
127 | section = __nr_to_section(nr); | ||
128 | page = pfn_to_page(pc - section->page_cgroup); | ||
129 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
130 | return page; | ||
131 | } | ||
132 | |||
133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) | 108 | static void *__meminit alloc_page_cgroup(size_t size, int nid) |
134 | { | 109 | { |
110 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
135 | void *addr = NULL; | 111 | void *addr = NULL; |
136 | gfp_t flags = GFP_KERNEL | __GFP_NOWARN; | ||
137 | 112 | ||
138 | addr = alloc_pages_exact_nid(nid, size, flags); | 113 | addr = alloc_pages_exact_nid(nid, size, flags); |
139 | if (addr) { | 114 | if (addr) { |
@@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) | |||
142 | } | 117 | } |
143 | 118 | ||
144 | if (node_state(nid, N_HIGH_MEMORY)) | 119 | if (node_state(nid, N_HIGH_MEMORY)) |
145 | addr = vmalloc_node(size, nid); | 120 | addr = vzalloc_node(size, nid); |
146 | else | 121 | else |
147 | addr = vmalloc(size); | 122 | addr = vzalloc(size); |
148 | 123 | ||
149 | return addr; | 124 | return addr; |
150 | } | 125 | } |
151 | 126 | ||
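Both allocation paths for the page_cgroup table now return zeroed memory (__GFP_ZERO for the page-allocator path, vzalloc*() above), which is what lets the surrounding hunks drop the explicit init_page_cgroup() loops: a zero-filled entry already has flags == 0 and mem_cgroup == NULL, and the array-id and lru fields no longer exist. A minimal sketch of the invariant being relied on, where pc stands for any entry of a freshly allocated table:

	/* Sketch: with zeroed allocation, no per-entry initialisation is
	 * needed before the entry is first charged. */
	VM_BUG_ON(pc->flags != 0);
	VM_BUG_ON(pc->mem_cgroup != NULL);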
152 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
153 | static void free_page_cgroup(void *addr) | ||
154 | { | ||
155 | if (is_vmalloc_addr(addr)) { | ||
156 | vfree(addr); | ||
157 | } else { | ||
158 | struct page *page = virt_to_page(addr); | ||
159 | size_t table_size = | ||
160 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
161 | |||
162 | BUG_ON(PageReserved(page)); | ||
163 | free_pages_exact(addr, table_size); | ||
164 | } | ||
165 | } | ||
166 | #endif | ||
167 | |||
168 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) | 127 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) |
169 | { | 128 | { |
170 | struct page_cgroup *base, *pc; | ||
171 | struct mem_section *section; | 129 | struct mem_section *section; |
130 | struct page_cgroup *base; | ||
172 | unsigned long table_size; | 131 | unsigned long table_size; |
173 | unsigned long nr; | ||
174 | int index; | ||
175 | 132 | ||
176 | nr = pfn_to_section_nr(pfn); | 133 | section = __pfn_to_section(pfn); |
177 | section = __nr_to_section(nr); | ||
178 | 134 | ||
179 | if (section->page_cgroup) | 135 | if (section->page_cgroup) |
180 | return 0; | 136 | return 0; |
@@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) | |||
194 | return -ENOMEM; | 150 | return -ENOMEM; |
195 | } | 151 | } |
196 | 152 | ||
197 | for (index = 0; index < PAGES_PER_SECTION; index++) { | ||
198 | pc = base + index; | ||
199 | init_page_cgroup(pc, nr); | ||
200 | } | ||
201 | /* | 153 | /* |
202 | * The passed "pfn" may not be aligned to SECTION. For the calculation | 154 | * The passed "pfn" may not be aligned to SECTION. For the calculation |
203 | * we need to apply a mask. | 155 | * we need to apply a mask. |
@@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) | |||
208 | return 0; | 160 | return 0; |
209 | } | 161 | } |
210 | #ifdef CONFIG_MEMORY_HOTPLUG | 162 | #ifdef CONFIG_MEMORY_HOTPLUG |
163 | static void free_page_cgroup(void *addr) | ||
164 | { | ||
165 | if (is_vmalloc_addr(addr)) { | ||
166 | vfree(addr); | ||
167 | } else { | ||
168 | struct page *page = virt_to_page(addr); | ||
169 | size_t table_size = | ||
170 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
171 | |||
172 | BUG_ON(PageReserved(page)); | ||
173 | free_pages_exact(addr, table_size); | ||
174 | } | ||
175 | } | ||
176 | |||
211 | void __free_page_cgroup(unsigned long pfn) | 177 | void __free_page_cgroup(unsigned long pfn) |
212 | { | 178 | { |
213 | struct mem_section *ms; | 179 | struct mem_section *ms; |
@@ -366,7 +332,6 @@ struct swap_cgroup { | |||
366 | unsigned short id; | 332 | unsigned short id; |
367 | }; | 333 | }; |
368 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | 334 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) |
369 | #define SC_POS_MASK (SC_PER_PAGE - 1) | ||
370 | 335 | ||
371 | /* | 336 | /* |
372 | * SwapCgroup implements "lookup" and "exchange" operations. | 337 | * SwapCgroup implements "lookup" and "exchange" operations. |
@@ -408,6 +373,21 @@ not_enough_page: | |||
408 | return -ENOMEM; | 373 | return -ENOMEM; |
409 | } | 374 | } |
410 | 375 | ||
376 | static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | ||
377 | struct swap_cgroup_ctrl **ctrlp) | ||
378 | { | ||
379 | pgoff_t offset = swp_offset(ent); | ||
380 | struct swap_cgroup_ctrl *ctrl; | ||
381 | struct page *mappage; | ||
382 | |||
383 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | ||
384 | if (ctrlp) | ||
385 | *ctrlp = ctrl; | ||
386 | |||
387 | mappage = ctrl->map[offset / SC_PER_PAGE]; | ||
388 | return page_address(mappage) + offset % SC_PER_PAGE; | ||
389 | } | ||
390 | |||
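The new static lookup_swap_cgroup() helper folds the offset arithmetic that was previously open-coded in swap_cgroup_cmpxchg(), swap_cgroup_record() and the old lookup function into one place: the swap offset selects a map page with offset / SC_PER_PAGE and a slot within it with offset % SC_PER_PAGE, which is also why the SC_POS_MASK define can go away. A self-contained model of that two-level addressing; PAGE_SIZE, the four-page map and the main() probe are purely illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE   4096UL
    struct swap_cgroup { unsigned short id; };
    #define SC_PER_PAGE (PAGE_SIZE / sizeof(struct swap_cgroup))

    /* One "map page" per SC_PER_PAGE entries, as in swap_cgroup_ctrl->map[]. */
    static struct swap_cgroup *map[4];

    static struct swap_cgroup *lookup(unsigned long offset)
    {
        struct swap_cgroup *page = map[offset / SC_PER_PAGE];

        return page + offset % SC_PER_PAGE;
    }

    int main(void)
    {
        unsigned long i;

        for (i = 0; i < 4; i++)
            map[i] = calloc(SC_PER_PAGE, sizeof(struct swap_cgroup));

        lookup(3000)->id = 42;      /* models swap_cgroup_record() */
        printf("slot %lu of map page %lu -> id %u\n",
               3000 % SC_PER_PAGE, 3000 / SC_PER_PAGE, lookup(3000)->id);
        return 0;
    }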
411 | /** | 391 | /** |
412 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 392 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
413 | * @ent: swap entry to be cmpxchged | 393 | * @ent: swap entry to be cmpxchged |
@@ -420,21 +400,13 @@ not_enough_page: | |||
420 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | 400 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, |
421 | unsigned short old, unsigned short new) | 401 | unsigned short old, unsigned short new) |
422 | { | 402 | { |
423 | int type = swp_type(ent); | ||
424 | unsigned long offset = swp_offset(ent); | ||
425 | unsigned long idx = offset / SC_PER_PAGE; | ||
426 | unsigned long pos = offset & SC_POS_MASK; | ||
427 | struct swap_cgroup_ctrl *ctrl; | 403 | struct swap_cgroup_ctrl *ctrl; |
428 | struct page *mappage; | ||
429 | struct swap_cgroup *sc; | 404 | struct swap_cgroup *sc; |
430 | unsigned long flags; | 405 | unsigned long flags; |
431 | unsigned short retval; | 406 | unsigned short retval; |
432 | 407 | ||
433 | ctrl = &swap_cgroup_ctrl[type]; | 408 | sc = lookup_swap_cgroup(ent, &ctrl); |
434 | 409 | ||
435 | mappage = ctrl->map[idx]; | ||
436 | sc = page_address(mappage); | ||
437 | sc += pos; | ||
438 | spin_lock_irqsave(&ctrl->lock, flags); | 410 | spin_lock_irqsave(&ctrl->lock, flags); |
439 | retval = sc->id; | 411 | retval = sc->id; |
440 | if (retval == old) | 412 | if (retval == old) |
@@ -455,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
455 | */ | 427 | */ |
456 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | 428 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) |
457 | { | 429 | { |
458 | int type = swp_type(ent); | ||
459 | unsigned long offset = swp_offset(ent); | ||
460 | unsigned long idx = offset / SC_PER_PAGE; | ||
461 | unsigned long pos = offset & SC_POS_MASK; | ||
462 | struct swap_cgroup_ctrl *ctrl; | 430 | struct swap_cgroup_ctrl *ctrl; |
463 | struct page *mappage; | ||
464 | struct swap_cgroup *sc; | 431 | struct swap_cgroup *sc; |
465 | unsigned short old; | 432 | unsigned short old; |
466 | unsigned long flags; | 433 | unsigned long flags; |
467 | 434 | ||
468 | ctrl = &swap_cgroup_ctrl[type]; | 435 | sc = lookup_swap_cgroup(ent, &ctrl); |
469 | 436 | ||
470 | mappage = ctrl->map[idx]; | ||
471 | sc = page_address(mappage); | ||
472 | sc += pos; | ||
473 | spin_lock_irqsave(&ctrl->lock, flags); | 437 | spin_lock_irqsave(&ctrl->lock, flags); |
474 | old = sc->id; | 438 | old = sc->id; |
475 | sc->id = id; | 439 | sc->id = id; |
@@ -479,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
479 | } | 443 | } |
480 | 444 | ||
481 | /** | 445 | /** |
482 | * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry | 446 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry |
483 | * @ent: swap entry to be looked up. | 447 | * @ent: swap entry to be looked up. |
484 | * | 448 | * |
485 | * Returns CSS ID of mem_cgroup on success, 0 on failure (0 is an invalid ID). | 449 | * Returns CSS ID of mem_cgroup on success, 0 on failure (0 is an invalid ID). |
486 | */ | 450 | */ |
487 | unsigned short lookup_swap_cgroup(swp_entry_t ent) | 451 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) |
488 | { | 452 | { |
489 | int type = swp_type(ent); | 453 | return lookup_swap_cgroup(ent, NULL)->id; |
490 | unsigned long offset = swp_offset(ent); | ||
491 | unsigned long idx = offset / SC_PER_PAGE; | ||
492 | unsigned long pos = offset & SC_POS_MASK; | ||
493 | struct swap_cgroup_ctrl *ctrl; | ||
494 | struct page *mappage; | ||
495 | struct swap_cgroup *sc; | ||
496 | unsigned short ret; | ||
497 | |||
498 | ctrl = &swap_cgroup_ctrl[type]; | ||
499 | mappage = ctrl->map[idx]; | ||
500 | sc = page_address(mappage); | ||
501 | sc += pos; | ||
502 | ret = sc->id; | ||
503 | return ret; | ||
504 | } | 454 | } |
505 | 455 | ||
506 | int swap_cgroup_swapon(int type, unsigned long max_pages) | 456 | int swap_cgroup_swapon(int type, unsigned long max_pages) |
@@ -773,7 +773,7 @@ out: | |||
773 | } | 773 | } |
774 | 774 | ||
775 | static int page_referenced_anon(struct page *page, | 775 | static int page_referenced_anon(struct page *page, |
776 | struct mem_cgroup *mem_cont, | 776 | struct mem_cgroup *memcg, |
777 | unsigned long *vm_flags) | 777 | unsigned long *vm_flags) |
778 | { | 778 | { |
779 | unsigned int mapcount; | 779 | unsigned int mapcount; |
@@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page, | |||
796 | * counting on behalf of references from different | 796 | * counting on behalf of references from different |
797 | * cgroups | 797 | * cgroups |
798 | */ | 798 | */ |
799 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 799 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) |
800 | continue; | 800 | continue; |
801 | referenced += page_referenced_one(page, vma, address, | 801 | referenced += page_referenced_one(page, vma, address, |
802 | &mapcount, vm_flags); | 802 | &mapcount, vm_flags); |
@@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page, | |||
811 | /** | 811 | /** |
812 | * page_referenced_file - referenced check for object-based rmap | 812 | * page_referenced_file - referenced check for object-based rmap |
813 | * @page: the page we're checking references on. | 813 | * @page: the page we're checking references on. |
814 | * @mem_cont: target memory controller | 814 | * @memcg: target memory control group |
815 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | 815 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page |
816 | * | 816 | * |
817 | * For an object-based mapped page, find all the places it is mapped and | 817 | * For an object-based mapped page, find all the places it is mapped and |
@@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page, | |||
822 | * This function is only called from page_referenced for object-based pages. | 822 | * This function is only called from page_referenced for object-based pages. |
823 | */ | 823 | */ |
824 | static int page_referenced_file(struct page *page, | 824 | static int page_referenced_file(struct page *page, |
825 | struct mem_cgroup *mem_cont, | 825 | struct mem_cgroup *memcg, |
826 | unsigned long *vm_flags) | 826 | unsigned long *vm_flags) |
827 | { | 827 | { |
828 | unsigned int mapcount; | 828 | unsigned int mapcount; |
@@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page, | |||
864 | * counting on behalf of references from different | 864 | * counting on behalf of references from different |
865 | * cgroups | 865 | * cgroups |
866 | */ | 866 | */ |
867 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 867 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) |
868 | continue; | 868 | continue; |
869 | referenced += page_referenced_one(page, vma, address, | 869 | referenced += page_referenced_one(page, vma, address, |
870 | &mapcount, vm_flags); | 870 | &mapcount, vm_flags); |
@@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page, | |||
880 | * page_referenced - test if the page was referenced | 880 | * page_referenced - test if the page was referenced |
881 | * @page: the page to test | 881 | * @page: the page to test |
882 | * @is_locked: caller holds lock on the page | 882 | * @is_locked: caller holds lock on the page |
883 | * @mem_cont: target memory controller | 883 | * @memcg: target memory cgroup |
884 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | 884 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page |
885 | * | 885 | * |
886 | * Quick test_and_clear_referenced for all mappings to a page, | 886 | * Quick test_and_clear_referenced for all mappings to a page, |
@@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page, | |||
888 | */ | 888 | */ |
889 | int page_referenced(struct page *page, | 889 | int page_referenced(struct page *page, |
890 | int is_locked, | 890 | int is_locked, |
891 | struct mem_cgroup *mem_cont, | 891 | struct mem_cgroup *memcg, |
892 | unsigned long *vm_flags) | 892 | unsigned long *vm_flags) |
893 | { | 893 | { |
894 | int referenced = 0; | 894 | int referenced = 0; |
@@ -904,13 +904,13 @@ int page_referenced(struct page *page, | |||
904 | } | 904 | } |
905 | } | 905 | } |
906 | if (unlikely(PageKsm(page))) | 906 | if (unlikely(PageKsm(page))) |
907 | referenced += page_referenced_ksm(page, mem_cont, | 907 | referenced += page_referenced_ksm(page, memcg, |
908 | vm_flags); | 908 | vm_flags); |
909 | else if (PageAnon(page)) | 909 | else if (PageAnon(page)) |
910 | referenced += page_referenced_anon(page, mem_cont, | 910 | referenced += page_referenced_anon(page, memcg, |
911 | vm_flags); | 911 | vm_flags); |
912 | else if (page->mapping) | 912 | else if (page->mapping) |
913 | referenced += page_referenced_file(page, mem_cont, | 913 | referenced += page_referenced_file(page, memcg, |
914 | vm_flags); | 914 | vm_flags); |
915 | if (we_locked) | 915 | if (we_locked) |
916 | unlock_page(page); | 916 | unlock_page(page); |
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
366 | const char *n) | 366 | const char *n) |
367 | { | 367 | { |
368 | VM_BUG_ON(!irqs_disabled()); | 368 | VM_BUG_ON(!irqs_disabled()); |
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | 369 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ |
370 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | 371 | if (s->flags & __CMPXCHG_DOUBLE) { |
371 | if (cmpxchg_double(&page->freelist, &page->counters, | 372 | if (cmpxchg_double(&page->freelist, &page->counters, |
372 | freelist_old, counters_old, | 373 | freelist_old, counters_old, |
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
400 | void *freelist_new, unsigned long counters_new, | 401 | void *freelist_new, unsigned long counters_new, |
401 | const char *n) | 402 | const char *n) |
402 | { | 403 | { |
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | 404 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ |
405 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | 406 | if (s->flags & __CMPXCHG_DOUBLE) { |
405 | if (cmpxchg_double(&page->freelist, &page->counters, | 407 | if (cmpxchg_double(&page->freelist, &page->counters, |
406 | freelist_old, counters_old, | 408 | freelist_old, counters_old, |
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3014 | } | 3016 | } |
3015 | } | 3017 | } |
3016 | 3018 | ||
3017 | #ifdef CONFIG_CMPXCHG_DOUBLE | 3019 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ |
3020 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | ||
3018 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | 3021 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) |
3019 | /* Enable fast mode */ | 3022 | /* Enable fast mode */ |
3020 | s->flags |= __CMPXCHG_DOUBLE; | 3023 | s->flags |= __CMPXCHG_DOUBLE; |
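When the combined #if test is false (no double-word cmpxchg, or struct page not aligned for it), SLUB leaves __CMPXCHG_DOUBLE clear and the functions above fall back to a lock-protected path that compares and replaces the freelist/counters pair together. A rough userspace model of that slow-path semantics only, using a pthread mutex where the kernel uses the slab lock; the struct and function names are stand-ins, not kernel API:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        void *freelist;
        unsigned long counters;
        pthread_mutex_t lock;               /* models slab_lock() */
    };

    /* Replace the (freelist, counters) pair only if both words still match. */
    static bool cmpxchg_double_slow(struct fake_page *page,
                                    void *freelist_old, unsigned long counters_old,
                                    void *freelist_new, unsigned long counters_new)
    {
        bool ok = false;

        pthread_mutex_lock(&page->lock);
        if (page->freelist == freelist_old && page->counters == counters_old) {
            page->freelist = freelist_new;
            page->counters = counters_new;
            ok = true;
        }
        pthread_mutex_unlock(&page->lock);
        return ok;
    }

    int main(void)
    {
        struct fake_page page = {
            .freelist = NULL, .counters = 3,
            .lock = PTHREAD_MUTEX_INITIALIZER,
        };

        printf("%d\n", cmpxchg_double_slow(&page, NULL, 3, &page, 4)); /* 1 */
        printf("%d\n", cmpxchg_double_slow(&page, NULL, 3, NULL, 5));  /* 0 */
        return 0;
    }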
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/export.h> | 24 | #include <linux/export.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | ||
27 | #include <linux/percpu_counter.h> | 26 | #include <linux/percpu_counter.h> |
28 | #include <linux/percpu.h> | 27 | #include <linux/percpu.h> |
29 | #include <linux/cpu.h> | 28 | #include <linux/cpu.h> |
@@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page) | |||
54 | spin_lock_irqsave(&zone->lru_lock, flags); | 53 | spin_lock_irqsave(&zone->lru_lock, flags); |
55 | VM_BUG_ON(!PageLRU(page)); | 54 | VM_BUG_ON(!PageLRU(page)); |
56 | __ClearPageLRU(page); | 55 | __ClearPageLRU(page); |
57 | del_page_from_lru(zone, page); | 56 | del_page_from_lru_list(zone, page, page_off_lru(page)); |
58 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
59 | } | 58 | } |
60 | } | 59 | } |
@@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
232 | static void pagevec_move_tail_fn(struct page *page, void *arg) | 231 | static void pagevec_move_tail_fn(struct page *page, void *arg) |
233 | { | 232 | { |
234 | int *pgmoved = arg; | 233 | int *pgmoved = arg; |
235 | struct zone *zone = page_zone(page); | ||
236 | 234 | ||
237 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 235 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
238 | enum lru_list lru = page_lru_base_type(page); | 236 | enum lru_list lru = page_lru_base_type(page); |
239 | list_move_tail(&page->lru, &zone->lru[lru].list); | 237 | struct lruvec *lruvec; |
240 | mem_cgroup_rotate_reclaimable_page(page); | 238 | |
239 | lruvec = mem_cgroup_lru_move_lists(page_zone(page), | ||
240 | page, lru, lru); | ||
241 | list_move_tail(&page->lru, &lruvec->lists[lru]); | ||
241 | (*pgmoved)++; | 242 | (*pgmoved)++; |
242 | } | 243 | } |
243 | } | 244 | } |
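The pattern repeated throughout these mm/swap.c hunks is that list manipulation no longer touches zone->lru[lru].list directly; a mem_cgroup hook such as mem_cgroup_lru_move_lists() or mem_cgroup_lru_add_list() returns the struct lruvec that owns the page, and the page is moved on lruvec->lists[lru]. A bare-bones sketch of that indirection under the simplifying assumption of one lruvec per zone and one per memcg (the real code keeps a per-memcg, per-zone structure):

    #define NR_LRU_LISTS 5

    struct list_head { struct list_head *prev, *next; };

    struct lruvec {
        struct list_head lists[NR_LRU_LISTS];
    };

    struct zone_stub  { struct lruvec lruvec; };    /* zone-owned lists */
    struct memcg_stub { struct lruvec lruvec; };    /* memcg-owned lists */

    /* Models the mem_cgroup_lru_*() helpers: hand back whichever lruvec the
     * page should live on, so callers never index zone lists directly. */
    struct lruvec *pick_lruvec(struct zone_stub *zone, struct memcg_stub *memcg)
    {
        return memcg ? &memcg->lruvec : &zone->lruvec;
    }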
@@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page) | |||
368 | SetPageReferenced(page); | 369 | SetPageReferenced(page); |
369 | } | 370 | } |
370 | } | 371 | } |
371 | |||
372 | EXPORT_SYMBOL(mark_page_accessed); | 372 | EXPORT_SYMBOL(mark_page_accessed); |
373 | 373 | ||
374 | void __lru_cache_add(struct page *page, enum lru_list lru) | 374 | void __lru_cache_add(struct page *page, enum lru_list lru) |
@@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) | |||
377 | 377 | ||
378 | page_cache_get(page); | 378 | page_cache_get(page); |
379 | if (!pagevec_add(pvec, page)) | 379 | if (!pagevec_add(pvec, page)) |
380 | ____pagevec_lru_add(pvec, lru); | 380 | __pagevec_lru_add(pvec, lru); |
381 | put_cpu_var(lru_add_pvecs); | 381 | put_cpu_var(lru_add_pvecs); |
382 | } | 382 | } |
383 | EXPORT_SYMBOL(__lru_cache_add); | 383 | EXPORT_SYMBOL(__lru_cache_add); |
@@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
476 | */ | 476 | */ |
477 | SetPageReclaim(page); | 477 | SetPageReclaim(page); |
478 | } else { | 478 | } else { |
479 | struct lruvec *lruvec; | ||
479 | /* | 480 | /* |
480 | * The page's writeback ended while it was on the pagevec | 481 | * The page's writeback ended while it was on the pagevec |
481 | * We move the page to the tail of the inactive list. | 482 | * We move the page to the tail of the inactive list. |
482 | */ | 483 | */ |
483 | list_move_tail(&page->lru, &zone->lru[lru].list); | 484 | lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); |
484 | mem_cgroup_rotate_reclaimable_page(page); | 485 | list_move_tail(&page->lru, &lruvec->lists[lru]); |
485 | __count_vm_event(PGROTATED); | 486 | __count_vm_event(PGROTATED); |
486 | } | 487 | } |
487 | 488 | ||
@@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu) | |||
504 | for_each_lru(lru) { | 505 | for_each_lru(lru) { |
505 | pvec = &pvecs[lru - LRU_BASE]; | 506 | pvec = &pvecs[lru - LRU_BASE]; |
506 | if (pagevec_count(pvec)) | 507 | if (pagevec_count(pvec)) |
507 | ____pagevec_lru_add(pvec, lru); | 508 | __pagevec_lru_add(pvec, lru); |
508 | } | 509 | } |
509 | 510 | ||
510 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | 511 | pvec = &per_cpu(lru_rotate_pvecs, cpu); |
@@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
616 | } | 617 | } |
617 | VM_BUG_ON(!PageLRU(page)); | 618 | VM_BUG_ON(!PageLRU(page)); |
618 | __ClearPageLRU(page); | 619 | __ClearPageLRU(page); |
619 | del_page_from_lru(zone, page); | 620 | del_page_from_lru_list(zone, page, page_off_lru(page)); |
620 | } | 621 | } |
621 | 622 | ||
622 | list_add(&page->lru, &pages_to_free); | 623 | list_add(&page->lru, &pages_to_free); |
@@ -644,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec) | |||
644 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); | 645 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); |
645 | pagevec_reinit(pvec); | 646 | pagevec_reinit(pvec); |
646 | } | 647 | } |
647 | |||
648 | EXPORT_SYMBOL(__pagevec_release); | 648 | EXPORT_SYMBOL(__pagevec_release); |
649 | 649 | ||
650 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
650 | /* used by __split_huge_page_refcount() */ | 651 | /* used by __split_huge_page_refcount() */ |
651 | void lru_add_page_tail(struct zone* zone, | 652 | void lru_add_page_tail(struct zone* zone, |
652 | struct page *page, struct page *page_tail) | 653 | struct page *page, struct page *page_tail) |
@@ -654,7 +655,6 @@ void lru_add_page_tail(struct zone* zone, | |||
654 | int active; | 655 | int active; |
655 | enum lru_list lru; | 656 | enum lru_list lru; |
656 | const int file = 0; | 657 | const int file = 0; |
657 | struct list_head *head; | ||
658 | 658 | ||
659 | VM_BUG_ON(!PageHead(page)); | 659 | VM_BUG_ON(!PageHead(page)); |
660 | VM_BUG_ON(PageCompound(page_tail)); | 660 | VM_BUG_ON(PageCompound(page_tail)); |
@@ -673,18 +673,30 @@ void lru_add_page_tail(struct zone* zone, | |||
673 | lru = LRU_INACTIVE_ANON; | 673 | lru = LRU_INACTIVE_ANON; |
674 | } | 674 | } |
675 | update_page_reclaim_stat(zone, page_tail, file, active); | 675 | update_page_reclaim_stat(zone, page_tail, file, active); |
676 | if (likely(PageLRU(page))) | ||
677 | head = page->lru.prev; | ||
678 | else | ||
679 | head = &zone->lru[lru].list; | ||
680 | __add_page_to_lru_list(zone, page_tail, lru, head); | ||
681 | } else { | 676 | } else { |
682 | SetPageUnevictable(page_tail); | 677 | SetPageUnevictable(page_tail); |
683 | add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); | 678 | lru = LRU_UNEVICTABLE; |
679 | } | ||
680 | |||
681 | if (likely(PageLRU(page))) | ||
682 | list_add_tail(&page_tail->lru, &page->lru); | ||
683 | else { | ||
684 | struct list_head *list_head; | ||
685 | /* | ||
686 | * Head page has not yet been counted, as an hpage, | ||
687 | * so we must account for each subpage individually. | ||
688 | * | ||
689 | * Use the standard add function to put page_tail on the list, | ||
690 | * but then correct its position so they all end up in order. | ||
691 | */ | ||
692 | add_page_to_lru_list(zone, page_tail, lru); | ||
693 | list_head = page_tail->lru.prev; | ||
694 | list_move_tail(&page_tail->lru, list_head); | ||
684 | } | 695 | } |
685 | } | 696 | } |
697 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
686 | 698 | ||
687 | static void ____pagevec_lru_add_fn(struct page *page, void *arg) | 699 | static void __pagevec_lru_add_fn(struct page *page, void *arg) |
688 | { | 700 | { |
689 | enum lru_list lru = (enum lru_list)arg; | 701 | enum lru_list lru = (enum lru_list)arg; |
690 | struct zone *zone = page_zone(page); | 702 | struct zone *zone = page_zone(page); |
@@ -706,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) | |||
706 | * Add the passed pages to the LRU, then drop the caller's refcount | 718 | * Add the passed pages to the LRU, then drop the caller's refcount |
707 | * on them. Reinitialises the caller's pagevec. | 719 | * on them. Reinitialises the caller's pagevec. |
708 | */ | 720 | */ |
709 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 721 | void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
710 | { | 722 | { |
711 | VM_BUG_ON(is_unevictable_lru(lru)); | 723 | VM_BUG_ON(is_unevictable_lru(lru)); |
712 | 724 | ||
713 | pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); | 725 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); |
714 | } | ||
715 | |||
716 | EXPORT_SYMBOL(____pagevec_lru_add); | ||
717 | |||
718 | /* | ||
719 | * Try to drop buffers from the pages in a pagevec | ||
720 | */ | ||
721 | void pagevec_strip(struct pagevec *pvec) | ||
722 | { | ||
723 | int i; | ||
724 | |||
725 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
726 | struct page *page = pvec->pages[i]; | ||
727 | |||
728 | if (page_has_private(page) && trylock_page(page)) { | ||
729 | if (page_has_private(page)) | ||
730 | try_to_release_page(page, 0); | ||
731 | unlock_page(page); | ||
732 | } | ||
733 | } | ||
734 | } | 726 | } |
727 | EXPORT_SYMBOL(__pagevec_lru_add); | ||
735 | 728 | ||
736 | /** | 729 | /** |
737 | * pagevec_lookup - gang pagecache lookup | 730 | * pagevec_lookup - gang pagecache lookup |
@@ -755,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | |||
755 | pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); | 748 | pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); |
756 | return pagevec_count(pvec); | 749 | return pagevec_count(pvec); |
757 | } | 750 | } |
758 | |||
759 | EXPORT_SYMBOL(pagevec_lookup); | 751 | EXPORT_SYMBOL(pagevec_lookup); |
760 | 752 | ||
761 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | 753 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, |
@@ -765,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | |||
765 | nr_pages, pvec->pages); | 757 | nr_pages, pvec->pages); |
766 | return pagevec_count(pvec); | 758 | return pagevec_count(pvec); |
767 | } | 759 | } |
768 | |||
769 | EXPORT_SYMBOL(pagevec_lookup_tag); | 760 | EXPORT_SYMBOL(pagevec_lookup_tag); |
770 | 761 | ||
771 | /* | 762 | /* |
diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..470038a91873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
300 | new_page = alloc_page_vma(gfp_mask, vma, addr); | 300 | new_page = alloc_page_vma(gfp_mask, vma, addr); |
301 | if (!new_page) | 301 | if (!new_page) |
302 | break; /* Out of memory */ | 302 | break; /* Out of memory */ |
303 | /* | ||
304 | * The memcg-specific accounting when moving | ||
305 | * pages around the LRU lists relies on the | ||
306 | * page's owner (memcg) to be valid. Usually, | ||
307 | * pages are assigned to a new owner before | ||
308 | * being put on the LRU list, but since this | ||
309 | * is not the case here, the stale owner from | ||
310 | * a previous allocation cycle must be reset. | ||
311 | */ | ||
312 | mem_cgroup_reset_owner(new_page); | ||
303 | } | 313 | } |
304 | 314 | ||
305 | /* | 315 | /* |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9520592d4231..d999f090dfda 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free) | |||
847 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 847 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
848 | unsigned long addr, swp_entry_t entry, struct page *page) | 848 | unsigned long addr, swp_entry_t entry, struct page *page) |
849 | { | 849 | { |
850 | struct mem_cgroup *ptr; | 850 | struct mem_cgroup *memcg; |
851 | spinlock_t *ptl; | 851 | spinlock_t *ptl; |
852 | pte_t *pte; | 852 | pte_t *pte; |
853 | int ret = 1; | 853 | int ret = 1; |
854 | 854 | ||
855 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { | 855 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, |
856 | GFP_KERNEL, &memcg)) { | ||
856 | ret = -ENOMEM; | 857 | ret = -ENOMEM; |
857 | goto out_nolock; | 858 | goto out_nolock; |
858 | } | 859 | } |
@@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
860 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 861 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
861 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 862 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
862 | if (ret > 0) | 863 | if (ret > 0) |
863 | mem_cgroup_cancel_charge_swapin(ptr); | 864 | mem_cgroup_cancel_charge_swapin(memcg); |
864 | ret = 0; | 865 | ret = 0; |
865 | goto out; | 866 | goto out; |
866 | } | 867 | } |
@@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
871 | set_pte_at(vma->vm_mm, addr, pte, | 872 | set_pte_at(vma->vm_mm, addr, pte, |
872 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 873 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
873 | page_add_anon_rmap(page, vma, addr); | 874 | page_add_anon_rmap(page, vma, addr); |
874 | mem_cgroup_commit_charge_swapin(page, ptr); | 875 | mem_cgroup_commit_charge_swapin(page, memcg); |
875 | swap_free(entry); | 876 | swap_free(entry); |
876 | /* | 877 | /* |
877 | * Move the page to the active list so it is not | 878 | * Move the page to the active list so it is not |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 877ca046f43d..86ce9a526c17 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2378 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); | 2378 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); |
2379 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); | 2379 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); |
2380 | if (!vas || !vms) | 2380 | if (!vas || !vms) |
2381 | goto err_free; | 2381 | goto err_free2; |
2382 | 2382 | ||
2383 | for (area = 0; area < nr_vms; area++) { | 2383 | for (area = 0; area < nr_vms; area++) { |
2384 | vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); | 2384 | vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); |
@@ -2476,11 +2476,10 @@ found: | |||
2476 | 2476 | ||
2477 | err_free: | 2477 | err_free: |
2478 | for (area = 0; area < nr_vms; area++) { | 2478 | for (area = 0; area < nr_vms; area++) { |
2479 | if (vas) | 2479 | kfree(vas[area]); |
2480 | kfree(vas[area]); | 2480 | kfree(vms[area]); |
2481 | if (vms) | ||
2482 | kfree(vms[area]); | ||
2483 | } | 2481 | } |
2482 | err_free2: | ||
2484 | kfree(vas); | 2483 | kfree(vas); |
2485 | kfree(vms); | 2484 | kfree(vms); |
2486 | return NULL; | 2485 | return NULL; |
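The pcpu_get_vm_areas() change is a goto-label ordering fix: if allocating the vas/vms pointer arrays themselves fails, the old code jumped to a label that indexed those (possibly NULL) arrays, so the patch adds err_free2, which frees only the array pointers, and keeps err_free for failures after both arrays exist. The same pattern in a self-contained form (generic names, not the kernel's; the success path just returns and leaves the arrays to the hypothetical caller):

    #include <stdlib.h>

    int setup(int n)
    {
        int i;
        void **a = calloc(n, sizeof(*a));
        void **b = calloc(n, sizeof(*b));

        if (!a || !b)
            goto err_free2;             /* arrays may be NULL: skip the loop */

        for (i = 0; i < n; i++) {
            a[i] = malloc(16);
            b[i] = malloc(16);
            if (!a[i] || !b[i])
                goto err_free;
        }
        /* success: the caller would go on using a[] and b[] here */
        return 0;

    err_free:
        for (i = 0; i < n; i++) {       /* safe: both arrays exist on this path */
            free(a[i]);                 /* unreached slots are NULL from calloc */
            free(b[i]);
        }
    err_free2:
        free(a);                        /* free(NULL) is a no-op */
        free(b);
        return -1;
    }

    int main(void)
    {
        return setup(4) ? 1 : 0;
    }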
diff --git a/mm/vmscan.c b/mm/vmscan.c index 26f4a8a4e0c7..2880396f7953 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -103,8 +103,11 @@ struct scan_control { | |||
103 | */ | 103 | */ |
104 | reclaim_mode_t reclaim_mode; | 104 | reclaim_mode_t reclaim_mode; |
105 | 105 | ||
106 | /* Which cgroup do we reclaim from */ | 106 | /* |
107 | struct mem_cgroup *mem_cgroup; | 107 | * The memory cgroup that hit its limit and as a result is the |
108 | * primary target of this reclaim invocation. | ||
109 | */ | ||
110 | struct mem_cgroup *target_mem_cgroup; | ||
108 | 111 | ||
109 | /* | 112 | /* |
110 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 113 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
@@ -113,6 +116,11 @@ struct scan_control { | |||
113 | nodemask_t *nodemask; | 116 | nodemask_t *nodemask; |
114 | }; | 117 | }; |
115 | 118 | ||
119 | struct mem_cgroup_zone { | ||
120 | struct mem_cgroup *mem_cgroup; | ||
121 | struct zone *zone; | ||
122 | }; | ||
123 | |||
116 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 124 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
117 | 125 | ||
118 | #ifdef ARCH_HAS_PREFETCH | 126 | #ifdef ARCH_HAS_PREFETCH |
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list); | |||
153 | static DECLARE_RWSEM(shrinker_rwsem); | 161 | static DECLARE_RWSEM(shrinker_rwsem); |
154 | 162 | ||
155 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 163 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
156 | #define scanning_global_lru(sc) (!(sc)->mem_cgroup) | 164 | static bool global_reclaim(struct scan_control *sc) |
165 | { | ||
166 | return !sc->target_mem_cgroup; | ||
167 | } | ||
168 | |||
169 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
170 | { | ||
171 | return !mz->mem_cgroup; | ||
172 | } | ||
157 | #else | 173 | #else |
158 | #define scanning_global_lru(sc) (1) | 174 | static bool global_reclaim(struct scan_control *sc) |
175 | { | ||
176 | return true; | ||
177 | } | ||
178 | |||
179 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
180 | { | ||
181 | return true; | ||
182 | } | ||
159 | #endif | 183 | #endif |
160 | 184 | ||
161 | static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, | 185 | static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) |
162 | struct scan_control *sc) | ||
163 | { | 186 | { |
164 | if (!scanning_global_lru(sc)) | 187 | if (!scanning_global_lru(mz)) |
165 | return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); | 188 | return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); |
166 | 189 | ||
167 | return &zone->reclaim_stat; | 190 | return &mz->zone->reclaim_stat; |
168 | } | 191 | } |
169 | 192 | ||
170 | static unsigned long zone_nr_lru_pages(struct zone *zone, | 193 | static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, |
171 | struct scan_control *sc, enum lru_list lru) | 194 | enum lru_list lru) |
172 | { | 195 | { |
173 | if (!scanning_global_lru(sc)) | 196 | if (!scanning_global_lru(mz)) |
174 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, | 197 | return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, |
175 | zone_to_nid(zone), zone_idx(zone), BIT(lru)); | 198 | zone_to_nid(mz->zone), |
199 | zone_idx(mz->zone), | ||
200 | BIT(lru)); | ||
176 | 201 | ||
177 | return zone_page_state(zone, NR_LRU_BASE + lru); | 202 | return zone_page_state(mz->zone, NR_LRU_BASE + lru); |
178 | } | 203 | } |
179 | 204 | ||
180 | 205 | ||
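The patch splits the old scanning_global_lru(sc) macro into two separate questions: global_reclaim(sc), asked against the scan_control, answers "was this pass triggered by global pressure or by a memcg hitting its limit?", while scanning_global_lru(mz), asked against the new mem_cgroup_zone pair, answers "is this particular zone/LRU pair memcg-owned?". A compact restatement of just that structure, with the surrounding types reduced to stubs:

    #include <stdbool.h>

    struct mem_cgroup;          /* opaque in this sketch */
    struct zone;                /* opaque in this sketch */

    struct scan_control_stub {
        struct mem_cgroup *target_mem_cgroup;   /* NULL => global reclaim */
    };

    struct mem_cgroup_zone {
        struct mem_cgroup *mem_cgroup;          /* NULL => zone's own LRU */
        struct zone *zone;
    };

    /* Was this reclaim triggered by a memcg limit or by global pressure? */
    static inline bool global_reclaim(const struct scan_control_stub *sc)
    {
        return !sc->target_mem_cgroup;
    }

    /* Is this zone/LRU pair owned by a memcg or by the zone itself? */
    static inline bool scanning_global_lru(const struct mem_cgroup_zone *mz)
    {
        return !mz->mem_cgroup;
    }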
@@ -677,12 +702,13 @@ enum page_references { | |||
677 | }; | 702 | }; |
678 | 703 | ||
679 | static enum page_references page_check_references(struct page *page, | 704 | static enum page_references page_check_references(struct page *page, |
705 | struct mem_cgroup_zone *mz, | ||
680 | struct scan_control *sc) | 706 | struct scan_control *sc) |
681 | { | 707 | { |
682 | int referenced_ptes, referenced_page; | 708 | int referenced_ptes, referenced_page; |
683 | unsigned long vm_flags; | 709 | unsigned long vm_flags; |
684 | 710 | ||
685 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | 711 | referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); |
686 | referenced_page = TestClearPageReferenced(page); | 712 | referenced_page = TestClearPageReferenced(page); |
687 | 713 | ||
688 | /* Lumpy reclaim - ignore references */ | 714 | /* Lumpy reclaim - ignore references */ |
@@ -738,7 +764,7 @@ static enum page_references page_check_references(struct page *page, | |||
738 | * shrink_page_list() returns the number of reclaimed pages | 764 | * shrink_page_list() returns the number of reclaimed pages |
739 | */ | 765 | */ |
740 | static unsigned long shrink_page_list(struct list_head *page_list, | 766 | static unsigned long shrink_page_list(struct list_head *page_list, |
741 | struct zone *zone, | 767 | struct mem_cgroup_zone *mz, |
742 | struct scan_control *sc, | 768 | struct scan_control *sc, |
743 | int priority, | 769 | int priority, |
744 | unsigned long *ret_nr_dirty, | 770 | unsigned long *ret_nr_dirty, |
@@ -769,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
769 | goto keep; | 795 | goto keep; |
770 | 796 | ||
771 | VM_BUG_ON(PageActive(page)); | 797 | VM_BUG_ON(PageActive(page)); |
772 | VM_BUG_ON(page_zone(page) != zone); | 798 | VM_BUG_ON(page_zone(page) != mz->zone); |
773 | 799 | ||
774 | sc->nr_scanned++; | 800 | sc->nr_scanned++; |
775 | 801 | ||
@@ -803,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
803 | } | 829 | } |
804 | } | 830 | } |
805 | 831 | ||
806 | references = page_check_references(page, sc); | 832 | references = page_check_references(page, mz, sc); |
807 | switch (references) { | 833 | switch (references) { |
808 | case PAGEREF_ACTIVATE: | 834 | case PAGEREF_ACTIVATE: |
809 | goto activate_locked; | 835 | goto activate_locked; |
@@ -994,8 +1020,8 @@ keep_lumpy: | |||
994 | * back off and wait for congestion to clear because further reclaim | 1020 | * back off and wait for congestion to clear because further reclaim |
995 | * will encounter the same problem | 1021 | * will encounter the same problem |
996 | */ | 1022 | */ |
997 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) | 1023 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) |
998 | zone_set_flag(zone, ZONE_CONGESTED); | 1024 | zone_set_flag(mz->zone, ZONE_CONGESTED); |
999 | 1025 | ||
1000 | free_hot_cold_page_list(&free_pages, 1); | 1026 | free_hot_cold_page_list(&free_pages, 1); |
1001 | 1027 | ||
@@ -1049,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1049 | 1075 | ||
1050 | ret = -EBUSY; | 1076 | ret = -EBUSY; |
1051 | 1077 | ||
1052 | if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) | 1078 | /* |
1053 | return ret; | 1079 | * To minimise LRU disruption, the caller can indicate that it only |
1080 | * wants to isolate pages it will be able to operate on without | ||
1081 | * blocking - clean pages for the most part. | ||
1082 | * | ||
1083 | * ISOLATE_CLEAN means that only clean pages should be isolated. This | ||
1084 | * is used by reclaim when it cannot write to backing storage | ||
1085 | * | ||
1086 | * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages | ||
1087 | * that can be migrated without blocking | ||
1088 | */ | ||
1089 | if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { | ||
1090 | /* All the caller can do on PageWriteback is block */ | ||
1091 | if (PageWriteback(page)) | ||
1092 | return ret; | ||
1093 | |||
1094 | if (PageDirty(page)) { | ||
1095 | struct address_space *mapping; | ||
1096 | |||
1097 | /* ISOLATE_CLEAN means only clean pages */ | ||
1098 | if (mode & ISOLATE_CLEAN) | ||
1099 | return ret; | ||
1100 | |||
1101 | /* | ||
1102 | * Only pages without mappings or that have a | ||
1103 | * ->migratepage callback can be migrated | ||
1104 | * without blocking | ||
1105 | */ | ||
1106 | mapping = page_mapping(page); | ||
1107 | if (mapping && !mapping->a_ops->migratepage) | ||
1108 | return ret; | ||
1109 | } | ||
1110 | } | ||
1054 | 1111 | ||
1055 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | 1112 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) |
1056 | return ret; | 1113 | return ret; |
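Condensed, the new policy in __isolate_lru_page() is: under either restrictive mode, skip anything under writeback; skip dirty pages entirely when only clean pages are wanted; and under async migration, take a dirty page only if it has no mapping or its mapping supplies ->migratepage. The same decision table as a tiny standalone function; the flag values and the page/mapping structs are simplified stand-ins, not the kernel's isolate_mode_t:

    #include <stdbool.h>
    #include <stddef.h>

    #define ISOLATE_CLEAN           0x1
    #define ISOLATE_ASYNC_MIGRATE   0x2

    struct stub_mapping { bool has_migratepage; };

    struct stub_page {
        bool dirty, writeback;
        struct stub_mapping *mapping;
    };

    /* Returns true if the page may be isolated under the given mode. */
    static bool may_isolate(const struct stub_page *page, unsigned int mode)
    {
        if (mode & (ISOLATE_CLEAN | ISOLATE_ASYNC_MIGRATE)) {
            if (page->writeback)
                return false;           /* all we could do is block */
            if (page->dirty) {
                if (mode & ISOLATE_CLEAN)
                    return false;       /* only clean pages wanted */
                if (page->mapping && !page->mapping->has_migratepage)
                    return false;       /* migrating it would block */
            }
        }
        return true;
    }

    int main(void)
    {
        struct stub_page dirty_anon = { .dirty = true, .mapping = NULL };

        /* Dirty but with no mapping: still acceptable for async migration. */
        return may_isolate(&dirty_anon, ISOLATE_ASYNC_MIGRATE) ? 0 : 1;
    }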
@@ -1079,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1079 | * Appropriate locks must be held before calling this function. | 1136 | * Appropriate locks must be held before calling this function. |
1080 | * | 1137 | * |
1081 | * @nr_to_scan: The number of pages to look through on the list. | 1138 | * @nr_to_scan: The number of pages to look through on the list. |
1082 | * @src: The LRU list to pull pages off. | 1139 | * @mz: The mem_cgroup_zone to pull pages from. |
1083 | * @dst: The temp list to put pages on to. | 1140 | * @dst: The temp list to put pages on to. |
1084 | * @scanned: The number of pages that were scanned. | 1141 | * @nr_scanned: The number of pages that were scanned. |
1085 | * @order: The caller's attempted allocation order | 1142 | * @order: The caller's attempted allocation order |
1086 | * @mode: One of the LRU isolation modes | 1143 | * @mode: One of the LRU isolation modes |
1144 | * @active: True [1] if isolating active pages | ||
1087 | * @file: True [1] if isolating file [!anon] pages | 1145 | * @file: True [1] if isolating file [!anon] pages |
1088 | * | 1146 | * |
1089 | * returns how many pages were moved onto *@dst. | 1147 | * returns how many pages were moved onto *@dst. |
1090 | */ | 1148 | */ |
1091 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1149 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1092 | struct list_head *src, struct list_head *dst, | 1150 | struct mem_cgroup_zone *mz, struct list_head *dst, |
1093 | unsigned long *scanned, int order, isolate_mode_t mode, | 1151 | unsigned long *nr_scanned, int order, isolate_mode_t mode, |
1094 | int file) | 1152 | int active, int file) |
1095 | { | 1153 | { |
1154 | struct lruvec *lruvec; | ||
1155 | struct list_head *src; | ||
1096 | unsigned long nr_taken = 0; | 1156 | unsigned long nr_taken = 0; |
1097 | unsigned long nr_lumpy_taken = 0; | 1157 | unsigned long nr_lumpy_taken = 0; |
1098 | unsigned long nr_lumpy_dirty = 0; | 1158 | unsigned long nr_lumpy_dirty = 0; |
1099 | unsigned long nr_lumpy_failed = 0; | 1159 | unsigned long nr_lumpy_failed = 0; |
1100 | unsigned long scan; | 1160 | unsigned long scan; |
1161 | int lru = LRU_BASE; | ||
1162 | |||
1163 | lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); | ||
1164 | if (active) | ||
1165 | lru += LRU_ACTIVE; | ||
1166 | if (file) | ||
1167 | lru += LRU_FILE; | ||
1168 | src = &lruvec->lists[lru]; | ||
1101 | 1169 | ||
1102 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 1170 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
1103 | struct page *page; | 1171 | struct page *page; |
@@ -1113,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1113 | 1181 | ||
1114 | switch (__isolate_lru_page(page, mode, file)) { | 1182 | switch (__isolate_lru_page(page, mode, file)) { |
1115 | case 0: | 1183 | case 0: |
1184 | mem_cgroup_lru_del(page); | ||
1116 | list_move(&page->lru, dst); | 1185 | list_move(&page->lru, dst); |
1117 | mem_cgroup_del_lru(page); | ||
1118 | nr_taken += hpage_nr_pages(page); | 1186 | nr_taken += hpage_nr_pages(page); |
1119 | break; | 1187 | break; |
1120 | 1188 | ||
1121 | case -EBUSY: | 1189 | case -EBUSY: |
1122 | /* else it is being freed elsewhere */ | 1190 | /* else it is being freed elsewhere */ |
1123 | list_move(&page->lru, src); | 1191 | list_move(&page->lru, src); |
1124 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
1125 | continue; | 1192 | continue; |
1126 | 1193 | ||
1127 | default: | 1194 | default: |
@@ -1171,13 +1238,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1171 | break; | 1238 | break; |
1172 | 1239 | ||
1173 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1240 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1241 | unsigned int isolated_pages; | ||
1242 | |||
1243 | mem_cgroup_lru_del(cursor_page); | ||
1174 | list_move(&cursor_page->lru, dst); | 1244 | list_move(&cursor_page->lru, dst); |
1175 | mem_cgroup_del_lru(cursor_page); | 1245 | isolated_pages = hpage_nr_pages(cursor_page); |
1176 | nr_taken += hpage_nr_pages(cursor_page); | 1246 | nr_taken += isolated_pages; |
1177 | nr_lumpy_taken++; | 1247 | nr_lumpy_taken += isolated_pages; |
1178 | if (PageDirty(cursor_page)) | 1248 | if (PageDirty(cursor_page)) |
1179 | nr_lumpy_dirty++; | 1249 | nr_lumpy_dirty += isolated_pages; |
1180 | scan++; | 1250 | scan++; |
1251 | pfn += isolated_pages - 1; | ||
1181 | } else { | 1252 | } else { |
1182 | /* | 1253 | /* |
1183 | * Check if the page is freed already. | 1254 | * Check if the page is freed already. |
@@ -1203,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1203 | nr_lumpy_failed++; | 1274 | nr_lumpy_failed++; |
1204 | } | 1275 | } |
1205 | 1276 | ||
1206 | *scanned = scan; | 1277 | *nr_scanned = scan; |
1207 | 1278 | ||
1208 | trace_mm_vmscan_lru_isolate(order, | 1279 | trace_mm_vmscan_lru_isolate(order, |
1209 | nr_to_scan, scan, | 1280 | nr_to_scan, scan, |
1210 | nr_taken, | 1281 | nr_taken, |
1211 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | 1282 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, |
1212 | mode); | 1283 | mode, file); |
1213 | return nr_taken; | 1284 | return nr_taken; |
1214 | } | 1285 | } |
1215 | 1286 | ||
1216 | static unsigned long isolate_pages_global(unsigned long nr, | ||
1217 | struct list_head *dst, | ||
1218 | unsigned long *scanned, int order, | ||
1219 | isolate_mode_t mode, | ||
1220 | struct zone *z, int active, int file) | ||
1221 | { | ||
1222 | int lru = LRU_BASE; | ||
1223 | if (active) | ||
1224 | lru += LRU_ACTIVE; | ||
1225 | if (file) | ||
1226 | lru += LRU_FILE; | ||
1227 | return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, | ||
1228 | mode, file); | ||
1229 | } | ||
1230 | |||
1231 | /* | ||
1232 | * clear_active_flags() is a helper for shrink_active_list(), clearing | ||
1233 | * any active bits from the pages in the list. | ||
1234 | */ | ||
1235 | static unsigned long clear_active_flags(struct list_head *page_list, | ||
1236 | unsigned int *count) | ||
1237 | { | ||
1238 | int nr_active = 0; | ||
1239 | int lru; | ||
1240 | struct page *page; | ||
1241 | |||
1242 | list_for_each_entry(page, page_list, lru) { | ||
1243 | int numpages = hpage_nr_pages(page); | ||
1244 | lru = page_lru_base_type(page); | ||
1245 | if (PageActive(page)) { | ||
1246 | lru += LRU_ACTIVE; | ||
1247 | ClearPageActive(page); | ||
1248 | nr_active += numpages; | ||
1249 | } | ||
1250 | if (count) | ||
1251 | count[lru] += numpages; | ||
1252 | } | ||
1253 | |||
1254 | return nr_active; | ||
1255 | } | ||
1256 | |||
1257 | /** | 1287 | /** |
1258 | * isolate_lru_page - tries to isolate a page from its LRU list | 1288 | * isolate_lru_page - tries to isolate a page from its LRU list |
1259 | * @page: page to isolate from its LRU list | 1289 | * @page: page to isolate from its LRU list |
@@ -1313,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1313 | if (current_is_kswapd()) | 1343 | if (current_is_kswapd()) |
1314 | return 0; | 1344 | return 0; |
1315 | 1345 | ||
1316 | if (!scanning_global_lru(sc)) | 1346 | if (!global_reclaim(sc)) |
1317 | return 0; | 1347 | return 0; |
1318 | 1348 | ||
1319 | if (file) { | 1349 | if (file) { |
@@ -1327,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1327 | return isolated > inactive; | 1357 | return isolated > inactive; |
1328 | } | 1358 | } |
1329 | 1359 | ||
1330 | /* | ||
1331 | * TODO: Try merging with migrations version of putback_lru_pages | ||
1332 | */ | ||
1333 | static noinline_for_stack void | 1360 | static noinline_for_stack void |
1334 | putback_lru_pages(struct zone *zone, struct scan_control *sc, | 1361 | putback_inactive_pages(struct mem_cgroup_zone *mz, |
1335 | unsigned long nr_anon, unsigned long nr_file, | 1362 | struct list_head *page_list) |
1336 | struct list_head *page_list) | ||
1337 | { | 1363 | { |
1338 | struct page *page; | 1364 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1339 | struct pagevec pvec; | 1365 | struct zone *zone = mz->zone; |
1340 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1366 | LIST_HEAD(pages_to_free); |
1341 | |||
1342 | pagevec_init(&pvec, 1); | ||
1343 | 1367 | ||
1344 | /* | 1368 | /* |
1345 | * Put back any unfreeable pages. | 1369 | * Put back any unfreeable pages. |
1346 | */ | 1370 | */ |
1347 | spin_lock(&zone->lru_lock); | ||
1348 | while (!list_empty(page_list)) { | 1371 | while (!list_empty(page_list)) { |
1372 | struct page *page = lru_to_page(page_list); | ||
1349 | int lru; | 1373 | int lru; |
1350 | page = lru_to_page(page_list); | 1374 | |
1351 | VM_BUG_ON(PageLRU(page)); | 1375 | VM_BUG_ON(PageLRU(page)); |
1352 | list_del(&page->lru); | 1376 | list_del(&page->lru); |
1353 | if (unlikely(!page_evictable(page, NULL))) { | 1377 | if (unlikely(!page_evictable(page, NULL))) { |
@@ -1364,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1364 | int numpages = hpage_nr_pages(page); | 1388 | int numpages = hpage_nr_pages(page); |
1365 | reclaim_stat->recent_rotated[file] += numpages; | 1389 | reclaim_stat->recent_rotated[file] += numpages; |
1366 | } | 1390 | } |
1367 | if (!pagevec_add(&pvec, page)) { | 1391 | if (put_page_testzero(page)) { |
1368 | spin_unlock_irq(&zone->lru_lock); | 1392 | __ClearPageLRU(page); |
1369 | __pagevec_release(&pvec); | 1393 | __ClearPageActive(page); |
1370 | spin_lock_irq(&zone->lru_lock); | 1394 | del_page_from_lru_list(zone, page, lru); |
1395 | |||
1396 | if (unlikely(PageCompound(page))) { | ||
1397 | spin_unlock_irq(&zone->lru_lock); | ||
1398 | (*get_compound_page_dtor(page))(page); | ||
1399 | spin_lock_irq(&zone->lru_lock); | ||
1400 | } else | ||
1401 | list_add(&page->lru, &pages_to_free); | ||
1371 | } | 1402 | } |
1372 | } | 1403 | } |
1373 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); | ||
1374 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); | ||
1375 | 1404 | ||
1376 | spin_unlock_irq(&zone->lru_lock); | 1405 | /* |
1377 | pagevec_release(&pvec); | 1406 | * To save our caller's stack, now use input list for pages to free. |
1407 | */ | ||
1408 | list_splice(&pages_to_free, page_list); | ||
1378 | } | 1409 | } |
1379 | 1410 | ||
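putback_inactive_pages() replaces the old pagevec dance: while walking the isolated list it drops each page's last reference itself via put_page_testzero(), hands compound pages straight to their destructor, parks ordinary pages on a local pages_to_free list, and finally splices that list back into the caller's list so one free_hot_cold_page_list() call can dispose of them without another list on the stack. A stripped-down model of the drop-ref / defer-free / splice-back flow, using a singly linked list and a plain counter in place of struct page (nothing here is kernel API):

    #include <stdio.h>
    #include <stddef.h>

    struct node {
        int refcount;
        struct node *next;
    };

    /* Walk *list; nodes whose last reference we drop are collected on a local
     * free list, which is then spliced back into *list for the caller. */
    static void putback(struct node **list)
    {
        struct node *to_free = NULL;

        while (*list) {
            struct node *n = *list;

            *list = n->next;
            if (--n->refcount == 0) {       /* models put_page_testzero() */
                n->next = to_free;
                to_free = n;
            }
            /* otherwise someone else still holds it; it stays off our list */
        }
        *list = to_free;    /* models list_splice(&pages_to_free, page_list) */
    }

    int main(void)
    {
        struct node b = { .refcount = 2, .next = NULL };
        struct node a = { .refcount = 1, .next = &b };
        struct node *list = &a;

        putback(&list);
        printf("%s\n", (list == &a && list->next == NULL) ?
               "only a is queued for freeing" : "unexpected");
        return 0;
    }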
1380 | static noinline_for_stack void update_isolated_counts(struct zone *zone, | 1411 | static noinline_for_stack void |
1381 | struct scan_control *sc, | 1412 | update_isolated_counts(struct mem_cgroup_zone *mz, |
1382 | unsigned long *nr_anon, | 1413 | struct list_head *page_list, |
1383 | unsigned long *nr_file, | 1414 | unsigned long *nr_anon, |
1384 | struct list_head *isolated_list) | 1415 | unsigned long *nr_file) |
1385 | { | 1416 | { |
1386 | unsigned long nr_active; | 1417 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1418 | struct zone *zone = mz->zone; | ||
1387 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1419 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1388 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1420 | unsigned long nr_active = 0; |
1421 | struct page *page; | ||
1422 | int lru; | ||
1423 | |||
1424 | /* | ||
1425 | * Count pages and clear active flags | ||
1426 | */ | ||
1427 | list_for_each_entry(page, page_list, lru) { | ||
1428 | int numpages = hpage_nr_pages(page); | ||
1429 | lru = page_lru_base_type(page); | ||
1430 | if (PageActive(page)) { | ||
1431 | lru += LRU_ACTIVE; | ||
1432 | ClearPageActive(page); | ||
1433 | nr_active += numpages; | ||
1434 | } | ||
1435 | count[lru] += numpages; | ||
1436 | } | ||
1389 | 1437 | ||
1390 | nr_active = clear_active_flags(isolated_list, count); | ||
1391 | __count_vm_events(PGDEACTIVATE, nr_active); | 1438 | __count_vm_events(PGDEACTIVATE, nr_active); |
1392 | 1439 | ||
1393 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | 1440 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
@@ -1401,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1401 | 1448 | ||
1402 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 1449 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; |
1403 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 1450 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; |
1404 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); | ||
1405 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); | ||
1406 | 1451 | ||
1407 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1452 | reclaim_stat->recent_scanned[0] += *nr_anon; |
1408 | reclaim_stat->recent_scanned[1] += *nr_file; | 1453 | reclaim_stat->recent_scanned[1] += *nr_file; |
@@ -1454,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1454 | * of reclaimed pages | 1499 | * of reclaimed pages |
1455 | */ | 1500 | */ |
1456 | static noinline_for_stack unsigned long | 1501 | static noinline_for_stack unsigned long |
1457 | shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | 1502 | shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, |
1458 | struct scan_control *sc, int priority, int file) | 1503 | struct scan_control *sc, int priority, int file) |
1459 | { | 1504 | { |
1460 | LIST_HEAD(page_list); | 1505 | LIST_HEAD(page_list); |
1461 | unsigned long nr_scanned; | 1506 | unsigned long nr_scanned; |
@@ -1466,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1466 | unsigned long nr_dirty = 0; | 1511 | unsigned long nr_dirty = 0; |
1467 | unsigned long nr_writeback = 0; | 1512 | unsigned long nr_writeback = 0; |
1468 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | 1513 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; |
1514 | struct zone *zone = mz->zone; | ||
1469 | 1515 | ||
1470 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1516 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1471 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1517 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1488,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1488 | 1534 | ||
1489 | spin_lock_irq(&zone->lru_lock); | 1535 | spin_lock_irq(&zone->lru_lock); |
1490 | 1536 | ||
1491 | if (scanning_global_lru(sc)) { | 1537 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, |
1492 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, | 1538 | &nr_scanned, sc->order, |
1493 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); | 1539 | reclaim_mode, 0, file); |
1540 | if (global_reclaim(sc)) { | ||
1494 | zone->pages_scanned += nr_scanned; | 1541 | zone->pages_scanned += nr_scanned; |
1495 | if (current_is_kswapd()) | 1542 | if (current_is_kswapd()) |
1496 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1543 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1498,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1498 | else | 1545 | else |
1499 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1500 | nr_scanned); | 1547 | nr_scanned); |
1501 | } else { | ||
1502 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, | ||
1503 | &nr_scanned, sc->order, reclaim_mode, zone, | ||
1504 | sc->mem_cgroup, 0, file); | ||
1505 | /* | ||
1506 | * mem_cgroup_isolate_pages() keeps track of | ||
1507 | * scanned pages on its own. | ||
1508 | */ | ||
1509 | } | 1548 | } |
1510 | 1549 | ||
1511 | if (nr_taken == 0) { | 1550 | if (nr_taken == 0) { |
@@ -1513,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1513 | return 0; | 1552 | return 0; |
1514 | } | 1553 | } |
1515 | 1554 | ||
1516 | update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); | 1555 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); |
1556 | |||
1557 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | ||
1558 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | ||
1517 | 1559 | ||
1518 | spin_unlock_irq(&zone->lru_lock); | 1560 | spin_unlock_irq(&zone->lru_lock); |
1519 | 1561 | ||
1520 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, | 1562 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, |
1521 | &nr_dirty, &nr_writeback); | 1563 | &nr_dirty, &nr_writeback); |
1522 | 1564 | ||
1523 | /* Check if we should synchronously wait for writeback */ | 1565 | /* Check if we should synchronously wait for writeback */ |
1524 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1566 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1525 | set_reclaim_mode(priority, sc, true); | 1567 | set_reclaim_mode(priority, sc, true); |
1526 | nr_reclaimed += shrink_page_list(&page_list, zone, sc, | 1568 | nr_reclaimed += shrink_page_list(&page_list, mz, sc, |
1527 | priority, &nr_dirty, &nr_writeback); | 1569 | priority, &nr_dirty, &nr_writeback); |
1528 | } | 1570 | } |
1529 | 1571 | ||
1530 | local_irq_disable(); | 1572 | spin_lock_irq(&zone->lru_lock); |
1573 | |||
1531 | if (current_is_kswapd()) | 1574 | if (current_is_kswapd()) |
1532 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1575 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
1533 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1576 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); |
1534 | 1577 | ||
1535 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1578 | putback_inactive_pages(mz, &page_list); |
1579 | |||
1580 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); | ||
1581 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); | ||
1582 | |||
1583 | spin_unlock_irq(&zone->lru_lock); | ||
1584 | |||
1585 | free_hot_cold_page_list(&page_list, 1); | ||
1536 | 1586 | ||
1537 | /* | 1587 | /* |
1538 | * If reclaim is isolating dirty pages under writeback, it implies | 1588 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -1588,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1588 | 1638 | ||
1589 | static void move_active_pages_to_lru(struct zone *zone, | 1639 | static void move_active_pages_to_lru(struct zone *zone, |
1590 | struct list_head *list, | 1640 | struct list_head *list, |
1641 | struct list_head *pages_to_free, | ||
1591 | enum lru_list lru) | 1642 | enum lru_list lru) |
1592 | { | 1643 | { |
1593 | unsigned long pgmoved = 0; | 1644 | unsigned long pgmoved = 0; |
1594 | struct pagevec pvec; | ||
1595 | struct page *page; | 1645 | struct page *page; |
1596 | 1646 | ||
1597 | pagevec_init(&pvec, 1); | 1647 | if (buffer_heads_over_limit) { |
1648 | spin_unlock_irq(&zone->lru_lock); | ||
1649 | list_for_each_entry(page, list, lru) { | ||
1650 | if (page_has_private(page) && trylock_page(page)) { | ||
1651 | if (page_has_private(page)) | ||
1652 | try_to_release_page(page, 0); | ||
1653 | unlock_page(page); | ||
1654 | } | ||
1655 | } | ||
1656 | spin_lock_irq(&zone->lru_lock); | ||
1657 | } | ||
1598 | 1658 | ||
1599 | while (!list_empty(list)) { | 1659 | while (!list_empty(list)) { |
1660 | struct lruvec *lruvec; | ||
1661 | |||
1600 | page = lru_to_page(list); | 1662 | page = lru_to_page(list); |
1601 | 1663 | ||
1602 | VM_BUG_ON(PageLRU(page)); | 1664 | VM_BUG_ON(PageLRU(page)); |
1603 | SetPageLRU(page); | 1665 | SetPageLRU(page); |
1604 | 1666 | ||
1605 | list_move(&page->lru, &zone->lru[lru].list); | 1667 | lruvec = mem_cgroup_lru_add_list(zone, page, lru); |
1606 | mem_cgroup_add_lru_list(page, lru); | 1668 | list_move(&page->lru, &lruvec->lists[lru]); |
1607 | pgmoved += hpage_nr_pages(page); | 1669 | pgmoved += hpage_nr_pages(page); |
1608 | 1670 | ||
1609 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1671 | if (put_page_testzero(page)) { |
1610 | spin_unlock_irq(&zone->lru_lock); | 1672 | __ClearPageLRU(page); |
1611 | if (buffer_heads_over_limit) | 1673 | __ClearPageActive(page); |
1612 | pagevec_strip(&pvec); | 1674 | del_page_from_lru_list(zone, page, lru); |
1613 | __pagevec_release(&pvec); | 1675 | |
1614 | spin_lock_irq(&zone->lru_lock); | 1676 | if (unlikely(PageCompound(page))) { |
1677 | spin_unlock_irq(&zone->lru_lock); | ||
1678 | (*get_compound_page_dtor(page))(page); | ||
1679 | spin_lock_irq(&zone->lru_lock); | ||
1680 | } else | ||
1681 | list_add(&page->lru, pages_to_free); | ||
1615 | } | 1682 | } |
1616 | } | 1683 | } |
1617 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | 1684 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
@@ -1619,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1619 | __count_vm_events(PGDEACTIVATE, pgmoved); | 1686 | __count_vm_events(PGDEACTIVATE, pgmoved); |
1620 | } | 1687 | } |
1621 | 1688 | ||
1622 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1689 | static void shrink_active_list(unsigned long nr_to_scan, |
1623 | struct scan_control *sc, int priority, int file) | 1690 | struct mem_cgroup_zone *mz, |
1691 | struct scan_control *sc, | ||
1692 | int priority, int file) | ||
1624 | { | 1693 | { |
1625 | unsigned long nr_taken; | 1694 | unsigned long nr_taken; |
1626 | unsigned long pgscanned; | 1695 | unsigned long nr_scanned; |
1627 | unsigned long vm_flags; | 1696 | unsigned long vm_flags; |
1628 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1697 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1629 | LIST_HEAD(l_active); | 1698 | LIST_HEAD(l_active); |
1630 | LIST_HEAD(l_inactive); | 1699 | LIST_HEAD(l_inactive); |
1631 | struct page *page; | 1700 | struct page *page; |
1632 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1701 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1633 | unsigned long nr_rotated = 0; | 1702 | unsigned long nr_rotated = 0; |
1634 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | 1703 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; |
1704 | struct zone *zone = mz->zone; | ||
1635 | 1705 | ||
1636 | lru_add_drain(); | 1706 | lru_add_drain(); |
1637 | 1707 | ||
@@ -1641,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1641 | reclaim_mode |= ISOLATE_CLEAN; | 1711 | reclaim_mode |= ISOLATE_CLEAN; |
1642 | 1712 | ||
1643 | spin_lock_irq(&zone->lru_lock); | 1713 | spin_lock_irq(&zone->lru_lock); |
1644 | if (scanning_global_lru(sc)) { | 1714 | |
1645 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1715 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, |
1646 | &pgscanned, sc->order, | 1716 | &nr_scanned, sc->order, |
1647 | reclaim_mode, zone, | 1717 | reclaim_mode, 1, file); |
1648 | 1, file); | 1718 | if (global_reclaim(sc)) |
1649 | zone->pages_scanned += pgscanned; | 1719 | zone->pages_scanned += nr_scanned; |
1650 | } else { | ||
1651 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | ||
1652 | &pgscanned, sc->order, | ||
1653 | reclaim_mode, zone, | ||
1654 | sc->mem_cgroup, 1, file); | ||
1655 | /* | ||
1656 | * mem_cgroup_isolate_pages() keeps track of | ||
1657 | * scanned pages on its own. | ||
1658 | */ | ||
1659 | } | ||
1660 | 1720 | ||
1661 | reclaim_stat->recent_scanned[file] += nr_taken; | 1721 | reclaim_stat->recent_scanned[file] += nr_taken; |
1662 | 1722 | ||
1663 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1723 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); |
1664 | if (file) | 1724 | if (file) |
1665 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); | 1725 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); |
1666 | else | 1726 | else |
@@ -1678,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1678 | continue; | 1738 | continue; |
1679 | } | 1739 | } |
1680 | 1740 | ||
1681 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1741 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { |
1682 | nr_rotated += hpage_nr_pages(page); | 1742 | nr_rotated += hpage_nr_pages(page); |
1683 | /* | 1743 | /* |
1684 | * Identify referenced, file-backed active pages and | 1744 | * Identify referenced, file-backed active pages and |
@@ -1711,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1711 | */ | 1771 | */ |
1712 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1772 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1713 | 1773 | ||
1714 | move_active_pages_to_lru(zone, &l_active, | 1774 | move_active_pages_to_lru(zone, &l_active, &l_hold, |
1715 | LRU_ACTIVE + file * LRU_FILE); | 1775 | LRU_ACTIVE + file * LRU_FILE); |
1716 | move_active_pages_to_lru(zone, &l_inactive, | 1776 | move_active_pages_to_lru(zone, &l_inactive, &l_hold, |
1717 | LRU_BASE + file * LRU_FILE); | 1777 | LRU_BASE + file * LRU_FILE); |
1718 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1778 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1719 | spin_unlock_irq(&zone->lru_lock); | 1779 | spin_unlock_irq(&zone->lru_lock); |
1780 | |||
1781 | free_hot_cold_page_list(&l_hold, 1); | ||
1720 | } | 1782 | } |
1721 | 1783 | ||
1722 | #ifdef CONFIG_SWAP | 1784 | #ifdef CONFIG_SWAP |
@@ -1741,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone) | |||
1741 | * Returns true if the zone does not have enough inactive anon pages, | 1803 | * Returns true if the zone does not have enough inactive anon pages, |
1742 | * meaning some active anon pages need to be deactivated. | 1804 | * meaning some active anon pages need to be deactivated. |
1743 | */ | 1805 | */ |
1744 | static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | 1806 | static int inactive_anon_is_low(struct mem_cgroup_zone *mz) |
1745 | { | 1807 | { |
1746 | int low; | ||
1747 | |||
1748 | /* | 1808 | /* |
1749 | * If we don't have swap space, anonymous page deactivation | 1809 | * If we don't have swap space, anonymous page deactivation |
1750 | * is pointless. | 1810 | * is pointless. |
@@ -1752,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1752 | if (!total_swap_pages) | 1812 | if (!total_swap_pages) |
1753 | return 0; | 1813 | return 0; |
1754 | 1814 | ||
1755 | if (scanning_global_lru(sc)) | 1815 | if (!scanning_global_lru(mz)) |
1756 | low = inactive_anon_is_low_global(zone); | 1816 | return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, |
1757 | else | 1817 | mz->zone); |
1758 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); | 1818 | |
1759 | return low; | 1819 | return inactive_anon_is_low_global(mz->zone); |
1760 | } | 1820 | } |
1761 | #else | 1821 | #else |
1762 | static inline int inactive_anon_is_low(struct zone *zone, | 1822 | static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) |
1763 | struct scan_control *sc) | ||
1764 | { | 1823 | { |
1765 | return 0; | 1824 | return 0; |
1766 | } | 1825 | } |
@@ -1778,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1778 | 1837 | ||
1779 | /** | 1838 | /** |
1780 | * inactive_file_is_low - check if file pages need to be deactivated | 1839 | * inactive_file_is_low - check if file pages need to be deactivated |
1781 | * @zone: zone to check | 1840 | * @mz: memory cgroup and zone to check |
1782 | * @sc: scan control of this context | ||
1783 | * | 1841 | * |
1784 | * When the system is doing streaming IO, memory pressure here | 1842 | * When the system is doing streaming IO, memory pressure here |
1785 | * ensures that active file pages get deactivated, until more | 1843 | * ensures that active file pages get deactivated, until more |
@@ -1791,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1791 | * This uses a different ratio than the anonymous pages, because | 1849 | * This uses a different ratio than the anonymous pages, because |
1792 | * the page cache uses a use-once replacement algorithm. | 1850 | * the page cache uses a use-once replacement algorithm. |
1793 | */ | 1851 | */ |
1794 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | 1852 | static int inactive_file_is_low(struct mem_cgroup_zone *mz) |
1795 | { | 1853 | { |
1796 | int low; | 1854 | if (!scanning_global_lru(mz)) |
1855 | return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, | ||
1856 | mz->zone); | ||
1797 | 1857 | ||
1798 | if (scanning_global_lru(sc)) | 1858 | return inactive_file_is_low_global(mz->zone); |
1799 | low = inactive_file_is_low_global(zone); | ||
1800 | else | ||
1801 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); | ||
1802 | return low; | ||
1803 | } | 1859 | } |
1804 | 1860 | ||
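The use-once balance described in the comment above reduces to comparing the sizes of the two file LRU lists: keep deactivating active file pages while the active list still outweighs the inactive one. A minimal standalone sketch with hypothetical LRU sizes (an illustration only, not the kernel helper, whose body lies outside these hunks):

    /*
     * Standalone sketch of the balance point referred to above: keep
     * refilling the inactive file LRU while the active file LRU is larger.
     * Both list sizes are hypothetical.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long active_file = 120000;
            unsigned long inactive_file = 45000;

            if (active_file > inactive_file)
                    printf("inactive file LRU is low: deactivate some active file pages\n");
            else
                    printf("inactive file LRU is large enough: leave the active list alone\n");
            return 0;
    }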
1805 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | 1861 | static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) |
1806 | int file) | ||
1807 | { | 1862 | { |
1808 | if (file) | 1863 | if (file) |
1809 | return inactive_file_is_low(zone, sc); | 1864 | return inactive_file_is_low(mz); |
1810 | else | 1865 | else |
1811 | return inactive_anon_is_low(zone, sc); | 1866 | return inactive_anon_is_low(mz); |
1812 | } | 1867 | } |
1813 | 1868 | ||
1814 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1869 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1815 | struct zone *zone, struct scan_control *sc, int priority) | 1870 | struct mem_cgroup_zone *mz, |
1871 | struct scan_control *sc, int priority) | ||
1816 | { | 1872 | { |
1817 | int file = is_file_lru(lru); | 1873 | int file = is_file_lru(lru); |
1818 | 1874 | ||
1819 | if (is_active_lru(lru)) { | 1875 | if (is_active_lru(lru)) { |
1820 | if (inactive_list_is_low(zone, sc, file)) | 1876 | if (inactive_list_is_low(mz, file)) |
1821 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1877 | shrink_active_list(nr_to_scan, mz, sc, priority, file); |
1822 | return 0; | 1878 | return 0; |
1823 | } | 1879 | } |
1824 | 1880 | ||
1825 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1881 | return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); |
1826 | } | 1882 | } |
1827 | 1883 | ||
1828 | static int vmscan_swappiness(struct scan_control *sc) | 1884 | static int vmscan_swappiness(struct mem_cgroup_zone *mz, |
1885 | struct scan_control *sc) | ||
1829 | { | 1886 | { |
1830 | if (scanning_global_lru(sc)) | 1887 | if (global_reclaim(sc)) |
1831 | return vm_swappiness; | 1888 | return vm_swappiness; |
1832 | return mem_cgroup_swappiness(sc->mem_cgroup); | 1889 | return mem_cgroup_swappiness(mz->mem_cgroup); |
1833 | } | 1890 | } |
1834 | 1891 | ||
1835 | /* | 1892 | /* |
@@ -1840,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1840 | * | 1897 | * |
1841 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1898 | * nr[0] = anon pages to scan; nr[1] = file pages to scan |
1842 | */ | 1899 | */ |
1843 | static void get_scan_count(struct zone *zone, struct scan_control *sc, | 1900 | static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, |
1844 | unsigned long *nr, int priority) | 1901 | unsigned long *nr, int priority) |
1845 | { | 1902 | { |
1846 | unsigned long anon, file, free; | 1903 | unsigned long anon, file, free; |
1847 | unsigned long anon_prio, file_prio; | 1904 | unsigned long anon_prio, file_prio; |
1848 | unsigned long ap, fp; | 1905 | unsigned long ap, fp; |
1849 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1906 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1850 | u64 fraction[2], denominator; | 1907 | u64 fraction[2], denominator; |
1851 | enum lru_list l; | 1908 | enum lru_list lru; |
1852 | int noswap = 0; | 1909 | int noswap = 0; |
1853 | bool force_scan = false; | 1910 | bool force_scan = false; |
1854 | 1911 | ||
@@ -1862,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1862 | * latencies, so it's better to scan a minimum amount there as | 1919 | * latencies, so it's better to scan a minimum amount there as |
1863 | * well. | 1920 | * well. |
1864 | */ | 1921 | */ |
1865 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1922 | if (current_is_kswapd() && mz->zone->all_unreclaimable) |
1866 | force_scan = true; | 1923 | force_scan = true; |
1867 | if (!scanning_global_lru(sc)) | 1924 | if (!global_reclaim(sc)) |
1868 | force_scan = true; | 1925 | force_scan = true; |
1869 | 1926 | ||
1870 | /* If we have no swap space, do not bother scanning anon pages. */ | 1927 | /* If we have no swap space, do not bother scanning anon pages. */ |
@@ -1876,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1876 | goto out; | 1933 | goto out; |
1877 | } | 1934 | } |
1878 | 1935 | ||
1879 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1936 | anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + |
1880 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1937 | zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); |
1881 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1938 | file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + |
1882 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1939 | zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); |
1883 | 1940 | ||
1884 | if (scanning_global_lru(sc)) { | 1941 | if (global_reclaim(sc)) { |
1885 | free = zone_page_state(zone, NR_FREE_PAGES); | 1942 | free = zone_page_state(mz->zone, NR_FREE_PAGES); |
1886 | /* If we have very few page cache pages, | 1943 | /* If we have very few page cache pages, |
1887 | force-scan anon pages. */ | 1944 | force-scan anon pages. */ |
1888 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1945 | if (unlikely(file + free <= high_wmark_pages(mz->zone))) { |
1889 | fraction[0] = 1; | 1946 | fraction[0] = 1; |
1890 | fraction[1] = 0; | 1947 | fraction[1] = 0; |
1891 | denominator = 1; | 1948 | denominator = 1; |
@@ -1897,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1897 | * With swappiness at 100, anonymous and file have the same priority. | 1954 | * With swappiness at 100, anonymous and file have the same priority. |
1898 | * This scanning priority is essentially the inverse of IO cost. | 1955 | * This scanning priority is essentially the inverse of IO cost. |
1899 | */ | 1956 | */ |
1900 | anon_prio = vmscan_swappiness(sc); | 1957 | anon_prio = vmscan_swappiness(mz, sc); |
1901 | file_prio = 200 - vmscan_swappiness(sc); | 1958 | file_prio = 200 - vmscan_swappiness(mz, sc); |
1902 | 1959 | ||
1903 | /* | 1960 | /* |
1904 | * OK, so we have swap space and a fair amount of page cache | 1961 | * OK, so we have swap space and a fair amount of page cache |
@@ -1911,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1911 | * | 1968 | * |
1912 | * anon in [0], file in [1] | 1969 | * anon in [0], file in [1] |
1913 | */ | 1970 | */ |
1914 | spin_lock_irq(&zone->lru_lock); | 1971 | spin_lock_irq(&mz->zone->lru_lock); |
1915 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1972 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1916 | reclaim_stat->recent_scanned[0] /= 2; | 1973 | reclaim_stat->recent_scanned[0] /= 2; |
1917 | reclaim_stat->recent_rotated[0] /= 2; | 1974 | reclaim_stat->recent_rotated[0] /= 2; |
@@ -1932,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1932 | 1989 | ||
1933 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1990 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); |
1934 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1991 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1935 | spin_unlock_irq(&zone->lru_lock); | 1992 | spin_unlock_irq(&mz->zone->lru_lock); |
1936 | 1993 | ||
1937 | fraction[0] = ap; | 1994 | fraction[0] = ap; |
1938 | fraction[1] = fp; | 1995 | fraction[1] = fp; |
1939 | denominator = ap + fp + 1; | 1996 | denominator = ap + fp + 1; |
1940 | out: | 1997 | out: |
1941 | for_each_evictable_lru(l) { | 1998 | for_each_evictable_lru(lru) { |
1942 | int file = is_file_lru(l); | 1999 | int file = is_file_lru(lru); |
1943 | unsigned long scan; | 2000 | unsigned long scan; |
1944 | 2001 | ||
1945 | scan = zone_nr_lru_pages(zone, sc, l); | 2002 | scan = zone_nr_lru_pages(mz, lru); |
1946 | if (priority || noswap) { | 2003 | if (priority || noswap) { |
1947 | scan >>= priority; | 2004 | scan >>= priority; |
1948 | if (!scan && force_scan) | 2005 | if (!scan && force_scan) |
1949 | scan = SWAP_CLUSTER_MAX; | 2006 | scan = SWAP_CLUSTER_MAX; |
1950 | scan = div64_u64(scan * fraction[file], denominator); | 2007 | scan = div64_u64(scan * fraction[file], denominator); |
1951 | } | 2008 | } |
1952 | nr[l] = scan; | 2009 | nr[lru] = scan; |
1953 | } | 2010 | } |
1954 | } | 2011 | } |
1955 | 2012 | ||
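The proportional split computed above is easier to follow with concrete numbers. The sketch below is standalone user-space arithmetic, not kernel code: it feeds hypothetical recent_scanned/recent_rotated samples and the usual default swappiness of 60 through the same formulas, and the anon list, most of whose scanned pages were rotated back, ends up with a much smaller share of the scan target than the file list.

    /*
     * Standalone illustration of the anon/file scan split. All reclaim
     * statistics below are hypothetical.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned long swappiness = 60;
            unsigned long anon_prio = swappiness;               /* 60 */
            unsigned long file_prio = 200 - swappiness;         /* 140 */

            /* hypothetical recent_scanned / recent_rotated samples */
            unsigned long anon_scanned = 1000, anon_rotated = 800;
            unsigned long file_scanned = 1000, file_rotated = 100;

            uint64_t ap = (uint64_t)(anon_prio + 1) * (anon_scanned + 1) / (anon_rotated + 1);
            uint64_t fp = (uint64_t)(file_prio + 1) * (file_scanned + 1) / (file_rotated + 1);
            uint64_t denominator = ap + fp + 1;

            unsigned long lru_size = 65536;     /* pages on one LRU list */
            int priority = 6;                   /* part-way down the priority loop */
            unsigned long scan = lru_size >> priority;

            printf("anon share: %llu of %lu pages\n",
                   (unsigned long long)(scan * ap / denominator), scan);
            printf("file share: %llu of %lu pages\n",
                   (unsigned long long)(scan * fp / denominator), scan);
            return 0;
    }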
@@ -1960,7 +2017,7 @@ out: | |||
1960 | * back to the allocator and call try_to_compact_zone(), we ensure that | 2017 | * back to the allocator and call try_to_compact_zone(), we ensure that |
1961 | * there are enough free pages for it to be likely successful | 2018 | * there are enough free pages for it to be likely successful |
1962 | */ | 2019 | */ |
1963 | static inline bool should_continue_reclaim(struct zone *zone, | 2020 | static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, |
1964 | unsigned long nr_reclaimed, | 2021 | unsigned long nr_reclaimed, |
1965 | unsigned long nr_scanned, | 2022 | unsigned long nr_scanned, |
1966 | struct scan_control *sc) | 2023 | struct scan_control *sc) |
@@ -2000,15 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2000 | * inactive lists are large enough, continue reclaiming | 2057 | * inactive lists are large enough, continue reclaiming |
2001 | */ | 2058 | */ |
2002 | pages_for_compaction = (2UL << sc->order); | 2059 | pages_for_compaction = (2UL << sc->order); |
2003 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 2060 | inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); |
2004 | if (nr_swap_pages > 0) | 2061 | if (nr_swap_pages > 0) |
2005 | inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 2062 | inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); |
2006 | if (sc->nr_reclaimed < pages_for_compaction && | 2063 | if (sc->nr_reclaimed < pages_for_compaction && |
2007 | inactive_lru_pages > pages_for_compaction) | 2064 | inactive_lru_pages > pages_for_compaction) |
2008 | return true; | 2065 | return true; |
2009 | 2066 | ||
2010 | /* If compaction would go ahead or the allocation would succeed, stop */ | 2067 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2011 | switch (compaction_suitable(zone, sc->order)) { | 2068 | switch (compaction_suitable(mz->zone, sc->order)) { |
2012 | case COMPACT_PARTIAL: | 2069 | case COMPACT_PARTIAL: |
2013 | case COMPACT_CONTINUE: | 2070 | case COMPACT_CONTINUE: |
2014 | return false; | 2071 | return false; |
@@ -2020,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2020 | /* | 2077 | /* |
2021 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2078 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2022 | */ | 2079 | */ |
2023 | static void shrink_zone(int priority, struct zone *zone, | 2080 | static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, |
2024 | struct scan_control *sc) | 2081 | struct scan_control *sc) |
2025 | { | 2082 | { |
2026 | unsigned long nr[NR_LRU_LISTS]; | 2083 | unsigned long nr[NR_LRU_LISTS]; |
2027 | unsigned long nr_to_scan; | 2084 | unsigned long nr_to_scan; |
2028 | enum lru_list l; | 2085 | enum lru_list lru; |
2029 | unsigned long nr_reclaimed, nr_scanned; | 2086 | unsigned long nr_reclaimed, nr_scanned; |
2030 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2087 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2031 | struct blk_plug plug; | 2088 | struct blk_plug plug; |
@@ -2033,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone, | |||
2033 | restart: | 2090 | restart: |
2034 | nr_reclaimed = 0; | 2091 | nr_reclaimed = 0; |
2035 | nr_scanned = sc->nr_scanned; | 2092 | nr_scanned = sc->nr_scanned; |
2036 | get_scan_count(zone, sc, nr, priority); | 2093 | get_scan_count(mz, sc, nr, priority); |
2037 | 2094 | ||
2038 | blk_start_plug(&plug); | 2095 | blk_start_plug(&plug); |
2039 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2096 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2040 | nr[LRU_INACTIVE_FILE]) { | 2097 | nr[LRU_INACTIVE_FILE]) { |
2041 | for_each_evictable_lru(l) { | 2098 | for_each_evictable_lru(lru) { |
2042 | if (nr[l]) { | 2099 | if (nr[lru]) { |
2043 | nr_to_scan = min_t(unsigned long, | 2100 | nr_to_scan = min_t(unsigned long, |
2044 | nr[l], SWAP_CLUSTER_MAX); | 2101 | nr[lru], SWAP_CLUSTER_MAX); |
2045 | nr[l] -= nr_to_scan; | 2102 | nr[lru] -= nr_to_scan; |
2046 | 2103 | ||
2047 | nr_reclaimed += shrink_list(l, nr_to_scan, | 2104 | nr_reclaimed += shrink_list(lru, nr_to_scan, |
2048 | zone, sc, priority); | 2105 | mz, sc, priority); |
2049 | } | 2106 | } |
2050 | } | 2107 | } |
2051 | /* | 2108 | /* |
@@ -2066,17 +2123,89 @@ restart: | |||
2066 | * Even if we did not try to evict anon pages at all, we want to | 2123 | * Even if we did not try to evict anon pages at all, we want to |
2067 | * rebalance the anon lru active/inactive ratio. | 2124 | * rebalance the anon lru active/inactive ratio. |
2068 | */ | 2125 | */ |
2069 | if (inactive_anon_is_low(zone, sc)) | 2126 | if (inactive_anon_is_low(mz)) |
2070 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 2127 | shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); |
2071 | 2128 | ||
2072 | /* reclaim/compaction might need reclaim to continue */ | 2129 | /* reclaim/compaction might need reclaim to continue */ |
2073 | if (should_continue_reclaim(zone, nr_reclaimed, | 2130 | if (should_continue_reclaim(mz, nr_reclaimed, |
2074 | sc->nr_scanned - nr_scanned, sc)) | 2131 | sc->nr_scanned - nr_scanned, sc)) |
2075 | goto restart; | 2132 | goto restart; |
2076 | 2133 | ||
2077 | throttle_vm_writeout(sc->gfp_mask); | 2134 | throttle_vm_writeout(sc->gfp_mask); |
2078 | } | 2135 | } |
2079 | 2136 | ||
2137 | static void shrink_zone(int priority, struct zone *zone, | ||
2138 | struct scan_control *sc) | ||
2139 | { | ||
2140 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
2141 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
2142 | .zone = zone, | ||
2143 | .priority = priority, | ||
2144 | }; | ||
2145 | struct mem_cgroup *memcg; | ||
2146 | |||
2147 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | ||
2148 | do { | ||
2149 | struct mem_cgroup_zone mz = { | ||
2150 | .mem_cgroup = memcg, | ||
2151 | .zone = zone, | ||
2152 | }; | ||
2153 | |||
2154 | shrink_mem_cgroup_zone(priority, &mz, sc); | ||
2155 | /* | ||
2156 | * Limit reclaim has historically picked one memcg and | ||
2157 | * scanned it with decreasing priority levels until | ||
2158 | * nr_to_reclaim had been reclaimed. This priority | ||
2159 | * cycle is thus over after a single memcg. | ||
2160 | * | ||
2161 | * Direct reclaim and kswapd, on the other hand, have | ||
2162 | * to scan all memory cgroups to fulfill the overall | ||
2163 | * scan target for the zone. | ||
2164 | */ | ||
2165 | if (!global_reclaim(sc)) { | ||
2166 | mem_cgroup_iter_break(root, memcg); | ||
2167 | break; | ||
2168 | } | ||
2169 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | ||
2170 | } while (memcg); | ||
2171 | } | ||
2172 | |||
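The walk above hands each memcg to shrink_mem_cgroup_zone() bundled with the zone in a small cookie. Its definition sits outside these hunks, but judging from the initializers used here it pairs just two fields; a sketch of that pairing, with field names taken from the hunks and comments that are interpretation only:

    /* Sketch of the (memcg, zone) cookie threaded through the reclaim paths.
     * The real definition is outside these hunks; both field names come from
     * the initializers in shrink_zone() above. */
    struct mem_cgroup;      /* opaque here */
    struct zone;            /* opaque here */

    struct mem_cgroup_zone {
            struct mem_cgroup *mem_cgroup;  /* whose LRU lists to scan */
            struct zone *zone;              /* the zone those lists belong to */
    };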
2173 | /* Returns true if compaction should go ahead for a high-order request */ | ||
2174 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | ||
2175 | { | ||
2176 | unsigned long balance_gap, watermark; | ||
2177 | bool watermark_ok; | ||
2178 | |||
2179 | /* Do not consider compaction for orders reclaim is meant to satisfy */ | ||
2180 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) | ||
2181 | return false; | ||
2182 | |||
2183 | /* | ||
2184 | * Compaction takes time to run and there are potentially other | ||
2185 | * callers using the pages just freed. Continue reclaiming until | ||
2186 | * there is a buffer of free pages available to give compaction | ||
2187 | * a reasonable chance of completing and allocating the page | ||
2188 | */ | ||
2189 | balance_gap = min(low_wmark_pages(zone), | ||
2190 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2191 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2192 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | ||
2193 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | ||
2194 | |||
2195 | /* | ||
2196 | * If compaction is deferred, reclaim up to a point where | ||
2197 | * compaction will have a chance of success when re-enabled | ||
2198 | */ | ||
2199 | if (compaction_deferred(zone)) | ||
2200 | return watermark_ok; | ||
2201 | |||
2202 | /* If compaction is not ready to start, keep reclaiming */ | ||
2203 | if (!compaction_suitable(zone, sc->order)) | ||
2204 | return false; | ||
2205 | |||
2206 | return watermark_ok; | ||
2207 | } | ||
2208 | |||
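The reclaim target used above can be worked through with example numbers. The sketch below is standalone arithmetic with hypothetical zone sizes and watermarks; KSWAPD_ZONE_BALANCE_GAP_RATIO is assumed to be 100, matching its use elsewhere in vmscan.c.

    /*
     * Standalone illustration of compaction_ready()'s watermark target.
     * Zone numbers are hypothetical; the balance-gap ratio of 100 is assumed.
     */
    #include <stdio.h>

    #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100

    int main(void)
    {
            unsigned long present_pages = 262144;   /* ~1GB zone with 4KB pages */
            unsigned long low_wmark = 2048;
            unsigned long high_wmark = 3072;
            int order = 9;                          /* THP-sized request */

            unsigned long gap = (present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                                KSWAPD_ZONE_BALANCE_GAP_RATIO;
            unsigned long balance_gap = low_wmark < gap ? low_wmark : gap;
            unsigned long watermark = high_wmark + balance_gap + (2UL << order);

            /* 3072 + 2048 + 1024 = 6144 free pages before compaction is "ready" */
            printf("keep reclaiming until %lu pages are free\n", watermark);
            return 0;
    }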
2080 | /* | 2209 | /* |
2081 | * This is the direct reclaim path, for page-allocating processes. We only | 2210 | * This is the direct reclaim path, for page-allocating processes. We only |
2082 | * try to reclaim pages from zones which will satisfy the caller's allocation | 2211 | * try to reclaim pages from zones which will satisfy the caller's allocation |
@@ -2094,8 +2223,9 @@ restart: | |||
2094 | * scan then give up on it. | 2223 | * scan then give up on it. |
2095 | * | 2224 | * |
2096 | * This function returns true if a zone is being reclaimed for a costly | 2225 | * This function returns true if a zone is being reclaimed for a costly |
2097 | * high-order allocation and compaction is either ready to begin or deferred. | 2226 | * high-order allocation and compaction is ready to begin. This indicates to |
2098 | * This indicates to the caller that it should retry the allocation or fail. | 2227 | * the caller that it should consider retrying the allocation instead of |
2228 | * further reclaim. | ||
2099 | */ | 2229 | */ |
2100 | static bool shrink_zones(int priority, struct zonelist *zonelist, | 2230 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
2101 | struct scan_control *sc) | 2231 | struct scan_control *sc) |
@@ -2104,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2104 | struct zone *zone; | 2234 | struct zone *zone; |
2105 | unsigned long nr_soft_reclaimed; | 2235 | unsigned long nr_soft_reclaimed; |
2106 | unsigned long nr_soft_scanned; | 2236 | unsigned long nr_soft_scanned; |
2107 | bool should_abort_reclaim = false; | 2237 | bool aborted_reclaim = false; |
2108 | 2238 | ||
2109 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2239 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2110 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2240 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2114,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2114 | * Take care memory controller reclaiming has small influence | 2244 | * Take care memory controller reclaiming has small influence |
2115 | * to global LRU. | 2245 | * to global LRU. |
2116 | */ | 2246 | */ |
2117 | if (scanning_global_lru(sc)) { | 2247 | if (global_reclaim(sc)) { |
2118 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2248 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2119 | continue; | 2249 | continue; |
2120 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2250 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
@@ -2129,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2129 | * noticeable problem, like transparent huge page | 2259 | * noticeable problem, like transparent huge page |
2130 | * allocations. | 2260 | * allocations. |
2131 | */ | 2261 | */ |
2132 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER && | 2262 | if (compaction_ready(zone, sc)) { |
2133 | (compaction_suitable(zone, sc->order) || | 2263 | aborted_reclaim = true; |
2134 | compaction_deferred(zone))) { | ||
2135 | should_abort_reclaim = true; | ||
2136 | continue; | 2264 | continue; |
2137 | } | 2265 | } |
2138 | } | 2266 | } |
@@ -2154,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2154 | shrink_zone(priority, zone, sc); | 2282 | shrink_zone(priority, zone, sc); |
2155 | } | 2283 | } |
2156 | 2284 | ||
2157 | return should_abort_reclaim; | 2285 | return aborted_reclaim; |
2158 | } | 2286 | } |
2159 | 2287 | ||
2160 | static bool zone_reclaimable(struct zone *zone) | 2288 | static bool zone_reclaimable(struct zone *zone) |
@@ -2208,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2208 | struct zoneref *z; | 2336 | struct zoneref *z; |
2209 | struct zone *zone; | 2337 | struct zone *zone; |
2210 | unsigned long writeback_threshold; | 2338 | unsigned long writeback_threshold; |
2339 | bool aborted_reclaim; | ||
2211 | 2340 | ||
2212 | get_mems_allowed(); | 2341 | get_mems_allowed(); |
2213 | delayacct_freepages_start(); | 2342 | delayacct_freepages_start(); |
2214 | 2343 | ||
2215 | if (scanning_global_lru(sc)) | 2344 | if (global_reclaim(sc)) |
2216 | count_vm_event(ALLOCSTALL); | 2345 | count_vm_event(ALLOCSTALL); |
2217 | 2346 | ||
2218 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2347 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2219 | sc->nr_scanned = 0; | 2348 | sc->nr_scanned = 0; |
2220 | if (!priority) | 2349 | if (!priority) |
2221 | disable_swap_token(sc->mem_cgroup); | 2350 | disable_swap_token(sc->target_mem_cgroup); |
2222 | if (shrink_zones(priority, zonelist, sc)) | 2351 | aborted_reclaim = shrink_zones(priority, zonelist, sc); |
2223 | break; | ||
2224 | 2352 | ||
2225 | /* | 2353 | /* |
2226 | * Don't shrink slabs when reclaiming memory from | 2354 | * Don't shrink slabs when reclaiming memory from |
2227 | * over limit cgroups | 2355 | * over limit cgroups |
2228 | */ | 2356 | */ |
2229 | if (scanning_global_lru(sc)) { | 2357 | if (global_reclaim(sc)) { |
2230 | unsigned long lru_pages = 0; | 2358 | unsigned long lru_pages = 0; |
2231 | for_each_zone_zonelist(zone, z, zonelist, | 2359 | for_each_zone_zonelist(zone, z, zonelist, |
2232 | gfp_zone(sc->gfp_mask)) { | 2360 | gfp_zone(sc->gfp_mask)) { |
@@ -2287,8 +2415,12 @@ out: | |||
2287 | if (oom_killer_disabled) | 2415 | if (oom_killer_disabled) |
2288 | return 0; | 2416 | return 0; |
2289 | 2417 | ||
2418 | /* Aborted reclaim to try compaction? don't OOM, then */ | ||
2419 | if (aborted_reclaim) | ||
2420 | return 1; | ||
2421 | |||
2290 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2422 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
2291 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2423 | if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) |
2292 | return 1; | 2424 | return 1; |
2293 | 2425 | ||
2294 | return 0; | 2426 | return 0; |
@@ -2305,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2305 | .may_unmap = 1, | 2437 | .may_unmap = 1, |
2306 | .may_swap = 1, | 2438 | .may_swap = 1, |
2307 | .order = order, | 2439 | .order = order, |
2308 | .mem_cgroup = NULL, | 2440 | .target_mem_cgroup = NULL, |
2309 | .nodemask = nodemask, | 2441 | .nodemask = nodemask, |
2310 | }; | 2442 | }; |
2311 | struct shrink_control shrink = { | 2443 | struct shrink_control shrink = { |
@@ -2325,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2325 | 2457 | ||
2326 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2458 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2327 | 2459 | ||
2328 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2460 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2329 | gfp_t gfp_mask, bool noswap, | 2461 | gfp_t gfp_mask, bool noswap, |
2330 | struct zone *zone, | 2462 | struct zone *zone, |
2331 | unsigned long *nr_scanned) | 2463 | unsigned long *nr_scanned) |
@@ -2337,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2337 | .may_unmap = 1, | 2469 | .may_unmap = 1, |
2338 | .may_swap = !noswap, | 2470 | .may_swap = !noswap, |
2339 | .order = 0, | 2471 | .order = 0, |
2340 | .mem_cgroup = mem, | 2472 | .target_mem_cgroup = memcg, |
2473 | }; | ||
2474 | struct mem_cgroup_zone mz = { | ||
2475 | .mem_cgroup = memcg, | ||
2476 | .zone = zone, | ||
2341 | }; | 2477 | }; |
2342 | 2478 | ||
2343 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2479 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
@@ -2354,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2354 | * will pick up pages from other mem cgroups as well. We hack | 2490 | * will pick up pages from other mem cgroups as well. We hack |
2355 | * the priority and make it zero. | 2491 | * the priority and make it zero. |
2356 | */ | 2492 | */ |
2357 | shrink_zone(0, zone, &sc); | 2493 | shrink_mem_cgroup_zone(0, &mz, &sc); |
2358 | 2494 | ||
2359 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2495 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2360 | 2496 | ||
@@ -2362,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2362 | return sc.nr_reclaimed; | 2498 | return sc.nr_reclaimed; |
2363 | } | 2499 | } |
2364 | 2500 | ||
2365 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2501 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, |
2366 | gfp_t gfp_mask, | 2502 | gfp_t gfp_mask, |
2367 | bool noswap) | 2503 | bool noswap) |
2368 | { | 2504 | { |
@@ -2375,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2375 | .may_swap = !noswap, | 2511 | .may_swap = !noswap, |
2376 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2512 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2377 | .order = 0, | 2513 | .order = 0, |
2378 | .mem_cgroup = mem_cont, | 2514 | .target_mem_cgroup = memcg, |
2379 | .nodemask = NULL, /* we don't care the placement */ | 2515 | .nodemask = NULL, /* we don't care the placement */ |
2380 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2516 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2381 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2517 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
@@ -2389,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2389 | * take care of from where we get pages. So the node where we start the | 2525 | * take care of from where we get pages. So the node where we start the |
2390 | * scan does not need to be the current node. | 2526 | * scan does not need to be the current node. |
2391 | */ | 2527 | */ |
2392 | nid = mem_cgroup_select_victim_node(mem_cont); | 2528 | nid = mem_cgroup_select_victim_node(memcg); |
2393 | 2529 | ||
2394 | zonelist = NODE_DATA(nid)->node_zonelists; | 2530 | zonelist = NODE_DATA(nid)->node_zonelists; |
2395 | 2531 | ||
@@ -2405,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2405 | } | 2541 | } |
2406 | #endif | 2542 | #endif |
2407 | 2543 | ||
2544 | static void age_active_anon(struct zone *zone, struct scan_control *sc, | ||
2545 | int priority) | ||
2546 | { | ||
2547 | struct mem_cgroup *memcg; | ||
2548 | |||
2549 | if (!total_swap_pages) | ||
2550 | return; | ||
2551 | |||
2552 | memcg = mem_cgroup_iter(NULL, NULL, NULL); | ||
2553 | do { | ||
2554 | struct mem_cgroup_zone mz = { | ||
2555 | .mem_cgroup = memcg, | ||
2556 | .zone = zone, | ||
2557 | }; | ||
2558 | |||
2559 | if (inactive_anon_is_low(&mz)) | ||
2560 | shrink_active_list(SWAP_CLUSTER_MAX, &mz, | ||
2561 | sc, priority, 0); | ||
2562 | |||
2563 | memcg = mem_cgroup_iter(NULL, memcg, NULL); | ||
2564 | } while (memcg); | ||
2565 | } | ||
2566 | |||
2408 | /* | 2567 | /* |
2409 | * pgdat_balanced is used when checking if a node is balanced for high-order | 2568 | * pgdat_balanced is used when checking if a node is balanced for high-order |
2410 | * allocations. Only zones that meet watermarks and are in a zone allowed | 2569 | * allocations. Only zones that meet watermarks and are in a zone allowed |
@@ -2525,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2525 | */ | 2684 | */ |
2526 | .nr_to_reclaim = ULONG_MAX, | 2685 | .nr_to_reclaim = ULONG_MAX, |
2527 | .order = order, | 2686 | .order = order, |
2528 | .mem_cgroup = NULL, | 2687 | .target_mem_cgroup = NULL, |
2529 | }; | 2688 | }; |
2530 | struct shrink_control shrink = { | 2689 | struct shrink_control shrink = { |
2531 | .gfp_mask = sc.gfp_mask, | 2690 | .gfp_mask = sc.gfp_mask, |
@@ -2564,9 +2723,7 @@ loop_again: | |||
2564 | * Do some background aging of the anon list, to give | 2723 | * Do some background aging of the anon list, to give |
2565 | * pages a chance to be referenced before reclaiming. | 2724 | * pages a chance to be referenced before reclaiming. |
2566 | */ | 2725 | */ |
2567 | if (inactive_anon_is_low(zone, &sc)) | 2726 | age_active_anon(zone, &sc, priority); |
2568 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | ||
2569 | &sc, priority, 0); | ||
2570 | 2727 | ||
2571 | if (!zone_watermark_ok_safe(zone, order, | 2728 | if (!zone_watermark_ok_safe(zone, order, |
2572 | high_wmark_pages(zone), 0, 0)) { | 2729 | high_wmark_pages(zone), 0, 0)) { |
@@ -3355,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) | |||
3355 | */ | 3512 | */ |
3356 | static void check_move_unevictable_page(struct page *page, struct zone *zone) | 3513 | static void check_move_unevictable_page(struct page *page, struct zone *zone) |
3357 | { | 3514 | { |
3358 | VM_BUG_ON(PageActive(page)); | 3515 | struct lruvec *lruvec; |
3359 | 3516 | ||
3517 | VM_BUG_ON(PageActive(page)); | ||
3360 | retry: | 3518 | retry: |
3361 | ClearPageUnevictable(page); | 3519 | ClearPageUnevictable(page); |
3362 | if (page_evictable(page, NULL)) { | 3520 | if (page_evictable(page, NULL)) { |
3363 | enum lru_list l = page_lru_base_type(page); | 3521 | enum lru_list l = page_lru_base_type(page); |
3364 | 3522 | ||
3365 | __dec_zone_state(zone, NR_UNEVICTABLE); | 3523 | __dec_zone_state(zone, NR_UNEVICTABLE); |
3366 | list_move(&page->lru, &zone->lru[l].list); | 3524 | lruvec = mem_cgroup_lru_move_lists(zone, page, |
3367 | mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); | 3525 | LRU_UNEVICTABLE, l); |
3526 | list_move(&page->lru, &lruvec->lists[l]); | ||
3368 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | 3527 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); |
3369 | __count_vm_event(UNEVICTABLE_PGRESCUED); | 3528 | __count_vm_event(UNEVICTABLE_PGRESCUED); |
3370 | } else { | 3529 | } else { |
@@ -3372,8 +3531,9 @@ retry: | |||
3372 | * rotate unevictable list | 3531 | * rotate unevictable list |
3373 | */ | 3532 | */ |
3374 | SetPageUnevictable(page); | 3533 | SetPageUnevictable(page); |
3375 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | 3534 | lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, |
3376 | mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); | 3535 | LRU_UNEVICTABLE); |
3536 | list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); | ||
3377 | if (page_evictable(page, NULL)) | 3537 | if (page_evictable(page, NULL)) |
3378 | goto retry; | 3538 | goto retry; |
3379 | } | 3539 | } |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b1665e..f600557a7659 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
295 | } | 295 | } |
296 | EXPORT_SYMBOL(__dec_zone_page_state); | 296 | EXPORT_SYMBOL(__dec_zone_page_state); |
297 | 297 | ||
298 | #ifdef CONFIG_CMPXCHG_LOCAL | 298 | #ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
299 | /* | 299 | /* |
300 | * If we have cmpxchg_local support then we do not need to incur the overhead | 300 | * If we have cmpxchg_local support then we do not need to incur the overhead |
301 | * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. | 301 | * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. |
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile new file mode 100644 index 000000000000..4ec84018cc13 --- /dev/null +++ b/tools/testing/selftests/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | TARGETS = breakpoints | ||
2 | |||
3 | all: | ||
4 | for TARGET in $(TARGETS); do \ | ||
5 | make -C $$TARGET; \ | ||
6 | done; | ||
7 | |||
8 | clean: | ||
9 | for TARGET in $(TARGETS); do \ | ||
10 | make -C $$TARGET clean; \ | ||
11 | done; | ||
diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile new file mode 100644 index 000000000000..f362722cdce7 --- /dev/null +++ b/tools/testing/selftests/breakpoints/Makefile | |||
@@ -0,0 +1,20 @@ | |||
1 | # Taken from perf makefile | ||
2 | uname_M := $(shell uname -m 2>/dev/null || echo not) | ||
3 | ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) | ||
4 | ifeq ($(ARCH),i386) | ||
5 | ARCH := x86 | ||
6 | endif | ||
7 | ifeq ($(ARCH),x86_64) | ||
8 | ARCH := x86 | ||
9 | endif | ||
10 | |||
11 | |||
12 | all: | ||
13 | ifeq ($(ARCH),x86) | ||
14 | gcc breakpoint_test.c -o run_test | ||
15 | else | ||
16 | echo "Not an x86 target, can't build breakpoints selftests" | ||
17 | endif | ||
18 | |||
19 | clean: | ||
20 | rm -fr run_test | ||
diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c new file mode 100644 index 000000000000..a0743f3b2b57 --- /dev/null +++ b/tools/testing/selftests/breakpoints/breakpoint_test.c | |||
@@ -0,0 +1,394 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> | ||
3 | * | ||
4 | * Licensed under the terms of the GNU GPL License version 2 | ||
5 | * | ||
6 | * Selftests for breakpoints (and more generally the do_debug() path) in x86. | ||
7 | */ | ||
8 | |||
9 | |||
10 | #include <sys/ptrace.h> | ||
11 | #include <unistd.h> | ||
12 | #include <stddef.h> | ||
13 | #include <sys/user.h> | ||
14 | #include <stdio.h> | ||
15 | #include <stdlib.h> | ||
16 | #include <signal.h> | ||
17 | #include <sys/types.h> | ||
18 | #include <sys/wait.h> | ||
19 | |||
20 | |||
21 | /* Breakpoint access modes */ | ||
22 | enum { | ||
23 | BP_X = 1, | ||
24 | BP_RW = 2, | ||
25 | BP_W = 4, | ||
26 | }; | ||
27 | |||
28 | static pid_t child_pid; | ||
29 | |||
30 | /* | ||
31 | * Ensures the child and parent are always "talking" about | ||
32 | * the same test sequence. (ie: that we haven't forgotten | ||
33 | * to call check_trapped() somewhere). | ||
34 | */ | ||
35 | static int nr_tests; | ||
36 | |||
37 | static void set_breakpoint_addr(void *addr, int n) | ||
38 | { | ||
39 | int ret; | ||
40 | |||
41 | ret = ptrace(PTRACE_POKEUSER, child_pid, | ||
42 | offsetof(struct user, u_debugreg[n]), addr); | ||
43 | if (ret) { | ||
44 | perror("Can't set breakpoint addr\n"); | ||
45 | exit(-1); | ||
46 | } | ||
47 | } | ||
48 | |||
49 | static void toggle_breakpoint(int n, int type, int len, | ||
50 | int local, int global, int set) | ||
51 | { | ||
52 | int ret; | ||
53 | |||
54 | int xtype, xlen; | ||
55 | unsigned long vdr7, dr7; | ||
56 | |||
57 | switch (type) { | ||
58 | case BP_X: | ||
59 | xtype = 0; | ||
60 | break; | ||
61 | case BP_W: | ||
62 | xtype = 1; | ||
63 | break; | ||
64 | case BP_RW: | ||
65 | xtype = 3; | ||
66 | break; | ||
67 | } | ||
68 | |||
69 | switch (len) { | ||
70 | case 1: | ||
71 | xlen = 0; | ||
72 | break; | ||
73 | case 2: | ||
74 | xlen = 4; | ||
75 | break; | ||
76 | case 4: | ||
77 | xlen = 0xc; | ||
78 | break; | ||
79 | case 8: | ||
80 | xlen = 8; | ||
81 | break; | ||
82 | } | ||
83 | |||
84 | dr7 = ptrace(PTRACE_PEEKUSER, child_pid, | ||
85 | offsetof(struct user, u_debugreg[7]), 0); | ||
86 | |||
87 | vdr7 = (xlen | xtype) << 16; | ||
88 | vdr7 <<= 4 * n; | ||
89 | |||
90 | if (local) { | ||
91 | vdr7 |= 1 << (2 * n); | ||
92 | vdr7 |= 1 << 8; | ||
93 | } | ||
94 | if (global) { | ||
95 | vdr7 |= 2 << (2 * n); | ||
96 | vdr7 |= 1 << 9; | ||
97 | } | ||
98 | |||
99 | if (set) | ||
100 | dr7 |= vdr7; | ||
101 | else | ||
102 | dr7 &= ~vdr7; | ||
103 | |||
104 | ret = ptrace(PTRACE_POKEUSER, child_pid, | ||
105 | offsetof(struct user, u_debugreg[7]), dr7); | ||
106 | if (ret) { | ||
107 | perror("Can't set dr7"); | ||
108 | exit(-1); | ||
109 | } | ||
110 | } | ||
111 | |||
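The dr7 arithmetic above packs a type/length nibble for slot n into bits 16 + 4*n and an enable bit into bit 2*n (local) or 2*n + 1 (global), together with the LE/GE slowdown bits 8 and 9. A standalone sketch of the value computed for one common case, a 4-byte write watchpoint in slot 0 with only the local enable set:

    /*
     * Standalone illustration of the DR7 bits toggle_breakpoint() sets for
     * slot 0, a 4-byte write watchpoint, local enable only.
     */
    #include <stdio.h>

    int main(void)
    {
            int n = 0;                      /* debug register slot */
            unsigned long xtype = 1;        /* BP_W: break on data writes */
            unsigned long xlen = 0xc;       /* 4-byte wide watchpoint */

            unsigned long vdr7 = (xlen | xtype) << 16;
            vdr7 <<= 4 * n;                 /* shift into slot n's control field */
            vdr7 |= 1 << (2 * n);           /* local enable for slot n */
            vdr7 |= 1 << 8;                 /* LE bit, set alongside local enables */

            printf("dr7 |= %#lx\n", vdr7);  /* 0xd0101 for these inputs */
            return 0;
    }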
112 | /* Dummy variables to test read/write accesses */ | ||
113 | static unsigned long long dummy_var[4]; | ||
114 | |||
115 | /* Dummy functions to test execution accesses */ | ||
116 | static void dummy_func(void) { } | ||
117 | static void dummy_func1(void) { } | ||
118 | static void dummy_func2(void) { } | ||
119 | static void dummy_func3(void) { } | ||
120 | |||
121 | static void (*dummy_funcs[])(void) = { | ||
122 | dummy_func, | ||
123 | dummy_func1, | ||
124 | dummy_func2, | ||
125 | dummy_func3, | ||
126 | }; | ||
127 | |||
128 | static int trapped; | ||
129 | |||
130 | static void check_trapped(void) | ||
131 | { | ||
132 | /* | ||
133 | * If we haven't trapped, wake up the parent | ||
134 | * so that it notices the failure. | ||
135 | */ | ||
136 | if (!trapped) | ||
137 | kill(getpid(), SIGUSR1); | ||
138 | trapped = 0; | ||
139 | |||
140 | nr_tests++; | ||
141 | } | ||
142 | |||
143 | static void write_var(int len) | ||
144 | { | ||
145 | char *pcval; short *psval; int *pival; long long *plval; | ||
146 | int i; | ||
147 | |||
148 | for (i = 0; i < 4; i++) { | ||
149 | switch (len) { | ||
150 | case 1: | ||
151 | pcval = (char *)&dummy_var[i]; | ||
152 | *pcval = 0xff; | ||
153 | break; | ||
154 | case 2: | ||
155 | psval = (short *)&dummy_var[i]; | ||
156 | *psval = 0xffff; | ||
157 | break; | ||
158 | case 4: | ||
159 | pival = (int *)&dummy_var[i]; | ||
160 | *pival = 0xffffffff; | ||
161 | break; | ||
162 | case 8: | ||
163 | plval = (long long *)&dummy_var[i]; | ||
164 | *plval = 0xffffffffffffffffLL; | ||
165 | break; | ||
166 | } | ||
167 | check_trapped(); | ||
168 | } | ||
169 | } | ||
170 | |||
171 | static void read_var(int len) | ||
172 | { | ||
173 | char cval; short sval; int ival; long long lval; | ||
174 | int i; | ||
175 | |||
176 | for (i = 0; i < 4; i++) { | ||
177 | switch (len) { | ||
178 | case 1: | ||
179 | cval = *(char *)&dummy_var[i]; | ||
180 | break; | ||
181 | case 2: | ||
182 | sval = *(short *)&dummy_var[i]; | ||
183 | break; | ||
184 | case 4: | ||
185 | ival = *(int *)&dummy_var[i]; | ||
186 | break; | ||
187 | case 8: | ||
188 | lval = *(long long *)&dummy_var[i]; | ||
189 | break; | ||
190 | } | ||
191 | check_trapped(); | ||
192 | } | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Do the r/w/x accesses to trigger the breakpoints. And run | ||
197 | * the usual traps. | ||
198 | */ | ||
199 | static void trigger_tests(void) | ||
200 | { | ||
201 | int len, local, global, i; | ||
202 | char val; | ||
203 | int ret; | ||
204 | |||
205 | ret = ptrace(PTRACE_TRACEME, 0, NULL, 0); | ||
206 | if (ret) { | ||
207 | perror("Can't be traced?\n"); | ||
208 | return; | ||
209 | } | ||
210 | |||
211 | /* Wake up the parent so that it sets up the first test */ | ||
212 | kill(getpid(), SIGUSR1); | ||
213 | |||
214 | /* Test instruction breakpoints */ | ||
215 | for (local = 0; local < 2; local++) { | ||
216 | for (global = 0; global < 2; global++) { | ||
217 | if (!local && !global) | ||
218 | continue; | ||
219 | |||
220 | for (i = 0; i < 4; i++) { | ||
221 | dummy_funcs[i](); | ||
222 | check_trapped(); | ||
223 | } | ||
224 | } | ||
225 | } | ||
226 | |||
227 | /* Test write watchpoints */ | ||
228 | for (len = 1; len <= sizeof(long); len <<= 1) { | ||
229 | for (local = 0; local < 2; local++) { | ||
230 | for (global = 0; global < 2; global++) { | ||
231 | if (!local && !global) | ||
232 | continue; | ||
233 | write_var(len); | ||
234 | } | ||
235 | } | ||
236 | } | ||
237 | |||
238 | /* Test read/write watchpoints (on read accesses) */ | ||
239 | for (len = 1; len <= sizeof(long); len <<= 1) { | ||
240 | for (local = 0; local < 2; local++) { | ||
241 | for (global = 0; global < 2; global++) { | ||
242 | if (!local && !global) | ||
243 | continue; | ||
244 | read_var(len); | ||
245 | } | ||
246 | } | ||
247 | } | ||
248 | |||
249 | /* Icebp trap */ | ||
250 | asm(".byte 0xf1\n"); | ||
251 | check_trapped(); | ||
252 | |||
253 | /* Int 3 trap */ | ||
254 | asm("int $3\n"); | ||
255 | check_trapped(); | ||
256 | |||
257 | kill(getpid(), SIGUSR1); | ||
258 | } | ||
259 | |||
260 | static void check_success(const char *msg) | ||
261 | { | ||
262 | const char *msg2; | ||
263 | int child_nr_tests; | ||
264 | int status; | ||
265 | |||
266 | /* Wait for the child to SIGTRAP */ | ||
267 | wait(&status); | ||
268 | |||
269 | msg2 = "Failed"; | ||
270 | |||
271 | if (WSTOPSIG(status) == SIGTRAP) { | ||
272 | child_nr_tests = ptrace(PTRACE_PEEKDATA, child_pid, | ||
273 | &nr_tests, 0); | ||
274 | if (child_nr_tests == nr_tests) | ||
275 | msg2 = "Ok"; | ||
276 | if (ptrace(PTRACE_POKEDATA, child_pid, &trapped, 1)) { | ||
277 | perror("Can't poke\n"); | ||
278 | exit(-1); | ||
279 | } | ||
280 | } | ||
281 | |||
282 | nr_tests++; | ||
283 | |||
284 | printf("%s [%s]\n", msg, msg2); | ||
285 | } | ||
286 | |||
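check_success() reads the child's nr_tests by passing the parent's own &nr_tests to PTRACE_PEEKDATA, which works because fork() leaves parent and child with identical virtual addresses for their globals; PTRACE_POKEDATA sets the child's trapped flag the same way. A minimal standalone sketch of that trick (error handling trimmed, not part of the selftest itself):

    /*
     * Minimal sketch: after fork(), the parent can use its own &counter as
     * the PTRACE_PEEKDATA address to read the child's private copy.
     */
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>
    #include <signal.h>
    #include <stdio.h>

    static long counter;

    int main(void)
    {
            pid_t pid = fork();

            if (!pid) {                             /* child */
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    counter = 42;                   /* only the child's copy changes */
                    raise(SIGSTOP);                 /* let the parent inspect us */
                    return 0;
            }

            waitpid(pid, NULL, 0);                  /* child is now stopped */
            long child_val = ptrace(PTRACE_PEEKDATA, pid, (void *)&counter, NULL);
            printf("parent copy: %ld, child copy: %ld\n", counter, child_val);

            ptrace(PTRACE_CONT, pid, NULL, NULL);   /* resume and reap the child */
            waitpid(pid, NULL, 0);
            return 0;
    }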
287 | static void launch_instruction_breakpoints(char *buf, int local, int global) | ||
288 | { | ||
289 | int i; | ||
290 | |||
291 | for (i = 0; i < 4; i++) { | ||
292 | set_breakpoint_addr(dummy_funcs[i], i); | ||
293 | toggle_breakpoint(i, BP_X, 1, local, global, 1); | ||
294 | ptrace(PTRACE_CONT, child_pid, NULL, 0); | ||
295 | sprintf(buf, "Test breakpoint %d with local: %d global: %d", | ||
296 | i, local, global); | ||
297 | check_success(buf); | ||
298 | toggle_breakpoint(i, BP_X, 1, local, global, 0); | ||
299 | } | ||
300 | } | ||
301 | |||
302 | static void launch_watchpoints(char *buf, int mode, int len, | ||
303 | int local, int global) | ||
304 | { | ||
305 | const char *mode_str; | ||
306 | int i; | ||
307 | |||
308 | if (mode == BP_W) | ||
309 | mode_str = "write"; | ||
310 | else | ||
311 | mode_str = "read"; | ||
312 | |||
313 | for (i = 0; i < 4; i++) { | ||
314 | set_breakpoint_addr(&dummy_var[i], i); | ||
315 | toggle_breakpoint(i, mode, len, local, global, 1); | ||
316 | ptrace(PTRACE_CONT, child_pid, NULL, 0); | ||
317 | sprintf(buf, "Test %s watchpoint %d with len: %d local: " | ||
318 | "%d global: %d", mode_str, i, len, local, global); | ||
319 | check_success(buf); | ||
320 | toggle_breakpoint(i, mode, len, local, global, 0); | ||
321 | } | ||
322 | } | ||
323 | |||
324 | /* Set the breakpoints and check that the child successfully triggers them */ | ||
325 | static void launch_tests(void) | ||
326 | { | ||
327 | char buf[1024]; | ||
328 | int len, local, global, i; | ||
329 | |||
330 | /* Instruction breakpoints */ | ||
331 | for (local = 0; local < 2; local++) { | ||
332 | for (global = 0; global < 2; global++) { | ||
333 | if (!local && !global) | ||
334 | continue; | ||
335 | launch_instruction_breakpoints(buf, local, global); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | /* Write watchpoint */ | ||
340 | for (len = 1; len <= sizeof(long); len <<= 1) { | ||
341 | for (local = 0; local < 2; local++) { | ||
342 | for (global = 0; global < 2; global++) { | ||
343 | if (!local && !global) | ||
344 | continue; | ||
345 | launch_watchpoints(buf, BP_W, len, | ||
346 | local, global); | ||
347 | } | ||
348 | } | ||
349 | } | ||
350 | |||
351 | /* Read-Write watchpoint */ | ||
352 | for (len = 1; len <= sizeof(long); len <<= 1) { | ||
353 | for (local = 0; local < 2; local++) { | ||
354 | for (global = 0; global < 2; global++) { | ||
355 | if (!local && !global) | ||
356 | continue; | ||
357 | launch_watchpoints(buf, BP_RW, len, | ||
358 | local, global); | ||
359 | } | ||
360 | } | ||
361 | } | ||
362 | |||
363 | /* Icebp traps */ | ||
364 | ptrace(PTRACE_CONT, child_pid, NULL, 0); | ||
365 | check_success("Test icebp"); | ||
366 | |||
367 | /* Int 3 traps */ | ||
368 | ptrace(PTRACE_CONT, child_pid, NULL, 0); | ||
369 | check_success("Test int 3 trap"); | ||
370 | |||
371 | ptrace(PTRACE_CONT, child_pid, NULL, 0); | ||
372 | } | ||
373 | |||
374 | int main(int argc, char **argv) | ||
375 | { | ||
376 | pid_t pid; | ||
377 | int ret; | ||
378 | |||
379 | pid = fork(); | ||
380 | if (!pid) { | ||
381 | trigger_tests(); | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | child_pid = pid; | ||
386 | |||
387 | wait(NULL); | ||
388 | |||
389 | launch_tests(); | ||
390 | |||
391 | wait(NULL); | ||
392 | |||
393 | return 0; | ||
394 | } | ||
diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests new file mode 100644 index 000000000000..320718a4e6bf --- /dev/null +++ b/tools/testing/selftests/run_tests | |||
@@ -0,0 +1,8 @@ | |||
1 | #!/bin/bash | ||
2 | |||
3 | TARGETS=breakpoints | ||
4 | |||
5 | for TARGET in $TARGETS | ||
6 | do | ||
7 | $TARGET/run_test | ||
8 | done | ||