Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 14
-rw-r--r-- | mm/Makefile | 8
-rw-r--r-- | mm/filemap.c | 6
-rw-r--r-- | mm/hugetlb.c | 12
-rw-r--r-- | mm/hwpoison-inject.c | 41
-rw-r--r-- | mm/ksm.c | 14
-rw-r--r-- | mm/madvise.c | 30
-rw-r--r-- | mm/memcontrol.c | 737
-rw-r--r-- | mm/memory-failure.c | 832
-rw-r--r-- | mm/memory.c | 86
-rw-r--r-- | mm/memory_hotplug.c | 6
-rw-r--r-- | mm/migrate.c | 2
-rw-r--r-- | mm/mremap.c | 4
-rw-r--r-- | mm/nommu.c | 85
-rw-r--r-- | mm/page-writeback.c | 27
-rw-r--r-- | mm/page_alloc.c | 44
-rw-r--r-- | mm/quicklist.c | 3
-rw-r--r-- | mm/rmap.c | 60
-rw-r--r-- | mm/shmem.c | 5
-rw-r--r-- | mm/swapfile.c | 4
-rw-r--r-- | mm/truncate.c | 136
-rw-r--r-- | mm/vmalloc.c | 2
-rw-r--r-- | mm/vmscan.c | 51
23 files changed, 1901 insertions, 308 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71eb0b4cce8d..247760729593 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR | |||
245 | /proc/sys/vm/mmap_min_addr tunable. | 245 | /proc/sys/vm/mmap_min_addr tunable. |
246 | 246 | ||
247 | 247 | ||
248 | config MEMORY_FAILURE | ||
249 | depends on MMU | ||
250 | depends on X86_MCE | ||
251 | bool "Enable recovery from hardware memory errors" | ||
252 | help | ||
253 | Enables code to recover from some memory failures on systems | ||
254 | with MCA recovery. This allows a system to continue running | ||
255 | even when some of its memory has uncorrected errors. This requires | ||
256 | special hardware support and typically ECC memory. | ||
257 | |||
258 | config HWPOISON_INJECT | ||
259 | tristate "Poison pages injector" | ||
260 | depends on MEMORY_FAILURE && DEBUG_KERNEL | ||
261 | |||
248 | config NOMMU_INITIAL_TRIM_EXCESS | 262 | config NOMMU_INITIAL_TRIM_EXCESS |
249 | int "Turn on mmap() excess space trimming before booting" | 263 | int "Turn on mmap() excess space trimming before booting" |
250 | depends on !MMU | 264 | depends on !MMU |
diff --git a/mm/Makefile b/mm/Makefile
index 728a9fde49d1..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,16 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o | 8 | vmalloc.o pagewalk.o |
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o mmu_context.o $(mmu-y) | 14 | page_isolation.o mm_init.o mmu_context.o \ |
15 | $(mmu-y) | ||
15 | obj-y += init-mm.o | 16 | obj-y += init-mm.o |
16 | 17 | ||
17 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | ||
18 | obj-$(CONFIG_BOUNCE) += bounce.o | 18 | obj-$(CONFIG_BOUNCE) += bounce.o |
19 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 19 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
20 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 20 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
@@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o | |||
41 | endif | 41 | endif |
42 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 42 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
43 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 43 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
44 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | ||
45 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | ||
44 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 46 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
45 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 47 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebbc..6c84e598b4a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@ | |||
58 | /* | 58 | /* |
59 | * Lock ordering: | 59 | * Lock ordering: |
60 | * | 60 | * |
61 | * ->i_mmap_lock (vmtruncate) | 61 | * ->i_mmap_lock (truncate_pagecache) |
62 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 62 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
63 | * ->swap_lock (exclusive_swap_page, others) | 63 | * ->swap_lock (exclusive_swap_page, others) |
64 | * ->mapping->tree_lock | 64 | * ->mapping->tree_lock |
@@ -104,6 +104,10 @@ | |||
104 | * | 104 | * |
105 | * ->task->proc_lock | 105 | * ->task->proc_lock |
106 | * ->dcache_lock (proc_pid_lookup) | 106 | * ->dcache_lock (proc_pid_lookup) |
107 | * | ||
108 | * (code doesn't rely on that order, so you could switch it around) | ||
109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | ||
110 | * ->i_mmap_lock | ||
107 | */ | 111 | */ |
108 | 112 | ||
109 | /* | 113 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 815dbd4a6dcb..6f048fcc749c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1537 | 1537 | ||
1538 | #ifdef CONFIG_SYSCTL | 1538 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
1540 | struct file *file, void __user *buffer, | 1540 | void __user *buffer, |
1541 | size_t *length, loff_t *ppos) | 1541 | size_t *length, loff_t *ppos) |
1542 | { | 1542 | { |
1543 | struct hstate *h = &default_hstate; | 1543 | struct hstate *h = &default_hstate; |
@@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1548 | 1548 | ||
1549 | table->data = &tmp; | 1549 | table->data = &tmp; |
1550 | table->maxlen = sizeof(unsigned long); | 1550 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1552 | ||
1553 | if (write) | 1553 | if (write) |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); |
@@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1557 | } | 1557 | } |
1558 | 1558 | ||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | struct file *file, void __user *buffer, | 1560 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1561 | size_t *length, loff_t *ppos) |
1562 | { | 1562 | { |
1563 | proc_dointvec(table, write, file, buffer, length, ppos); | 1563 | proc_dointvec(table, write, buffer, length, ppos); |
1564 | if (hugepages_treat_as_movable) | 1564 | if (hugepages_treat_as_movable) |
1565 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; | 1565 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; |
1566 | else | 1566 | else |
@@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | |||
1569 | } | 1569 | } |
1570 | 1570 | ||
1571 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | 1571 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
1572 | struct file *file, void __user *buffer, | 1572 | void __user *buffer, |
1573 | size_t *length, loff_t *ppos) | 1573 | size_t *length, loff_t *ppos) |
1574 | { | 1574 | { |
1575 | struct hstate *h = &default_hstate; | 1575 | struct hstate *h = &default_hstate; |
@@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1580 | 1580 | ||
1581 | table->data = &tmp; | 1581 | table->data = &tmp; |
1582 | table->maxlen = sizeof(unsigned long); | 1582 | table->maxlen = sizeof(unsigned long); |
1583 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1583 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1584 | 1584 | ||
1585 | if (write) { | 1585 | if (write) { |
1586 | spin_lock(&hugetlb_lock); | 1586 | spin_lock(&hugetlb_lock); |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@ | |||
1 | /* Inject a hwpoison memory failure on an arbitrary pfn */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/debugfs.h> | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/mm.h> | ||
6 | |||
7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | ||
8 | |||
9 | static int hwpoison_inject(void *data, u64 val) | ||
10 | { | ||
11 | if (!capable(CAP_SYS_ADMIN)) | ||
12 | return -EPERM; | ||
13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | ||
14 | return __memory_failure(val, 18, 0); | ||
15 | } | ||
16 | |||
17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | ||
18 | |||
19 | static void pfn_inject_exit(void) | ||
20 | { | ||
21 | if (hwpoison_dir) | ||
22 | debugfs_remove_recursive(hwpoison_dir); | ||
23 | } | ||
24 | |||
25 | static int pfn_inject_init(void) | ||
26 | { | ||
27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | ||
28 | if (hwpoison_dir == NULL) | ||
29 | return -ENOMEM; | ||
30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
31 | NULL, &hwpoison_fops); | ||
32 | if (corrupt_pfn == NULL) { | ||
33 | pfn_inject_exit(); | ||
34 | return -ENOMEM; | ||
35 | } | ||
36 | return 0; | ||
37 | } | ||
38 | |||
39 | module_init(pfn_inject_init); | ||
40 | module_exit(pfn_inject_exit); | ||
41 | MODULE_LICENSE("GPL"); | ||
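A minimal user-space sketch of how the injector above might be driven, assuming debugfs is mounted at the conventional /sys/kernel/debug path and CONFIG_HWPOISON_INJECT is enabled; the corrupt-pfn file name comes from the code above, while the demo program itself and the PFN passed to it are purely illustrative:

/* hwpoison_inject_demo.c - write a PFN into the debugfs injector (sketch) */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* Assumed debugfs mount point; adjust if mounted elsewhere. */
	const char *path = "/sys/kernel/debug/hwpoison/corrupt-pfn";
	char buf[32];
	int fd, len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	/* Base 0 allows a plain decimal or 0x-prefixed PFN. */
	len = snprintf(buf, sizeof(buf), "%llu\n", strtoull(argv[1], NULL, 0));
	if (write(fd, buf, len) != len)
		perror("write");
	close(fd);
	return 0;
}

Mirroring the capable() check in hwpoison_inject(), this only works with CAP_SYS_ADMIN.

diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c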
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
32 | #include <linux/mmu_notifier.h> | 32 | #include <linux/mmu_notifier.h> |
33 | #include <linux/swap.h> | ||
33 | #include <linux/ksm.h> | 34 | #include <linux/ksm.h> |
34 | 35 | ||
35 | #include <asm/tlbflush.h> | 36 | #include <asm/tlbflush.h> |
@@ -162,10 +163,10 @@ static unsigned long ksm_pages_unshared; | |||
162 | static unsigned long ksm_rmap_items; | 163 | static unsigned long ksm_rmap_items; |
163 | 164 | ||
164 | /* Limit on the number of unswappable pages used */ | 165 | /* Limit on the number of unswappable pages used */ |
165 | static unsigned long ksm_max_kernel_pages = 2000; | 166 | static unsigned long ksm_max_kernel_pages; |
166 | 167 | ||
167 | /* Number of pages ksmd should scan in one batch */ | 168 | /* Number of pages ksmd should scan in one batch */ |
168 | static unsigned int ksm_thread_pages_to_scan = 200; | 169 | static unsigned int ksm_thread_pages_to_scan = 100; |
169 | 170 | ||
170 | /* Milliseconds ksmd should sleep between batches */ | 171 | /* Milliseconds ksmd should sleep between batches */ |
171 | static unsigned int ksm_thread_sleep_millisecs = 20; | 172 | static unsigned int ksm_thread_sleep_millisecs = 20; |
@@ -173,7 +174,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20; | |||
173 | #define KSM_RUN_STOP 0 | 174 | #define KSM_RUN_STOP 0 |
174 | #define KSM_RUN_MERGE 1 | 175 | #define KSM_RUN_MERGE 1 |
175 | #define KSM_RUN_UNMERGE 2 | 176 | #define KSM_RUN_UNMERGE 2 |
176 | static unsigned int ksm_run = KSM_RUN_MERGE; | 177 | static unsigned int ksm_run = KSM_RUN_STOP; |
177 | 178 | ||
178 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 179 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
179 | static DEFINE_MUTEX(ksm_thread_mutex); | 180 | static DEFINE_MUTEX(ksm_thread_mutex); |
@@ -183,6 +184,11 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); | |||
183 | sizeof(struct __struct), __alignof__(struct __struct),\ | 184 | sizeof(struct __struct), __alignof__(struct __struct),\ |
184 | (__flags), NULL) | 185 | (__flags), NULL) |
185 | 186 | ||
187 | static void __init ksm_init_max_kernel_pages(void) | ||
188 | { | ||
189 | ksm_max_kernel_pages = nr_free_buffer_pages() / 4; | ||
190 | } | ||
191 | |||
186 | static int __init ksm_slab_init(void) | 192 | static int __init ksm_slab_init(void) |
187 | { | 193 | { |
188 | rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); | 194 | rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); |
@@ -1667,6 +1673,8 @@ static int __init ksm_init(void) | |||
1667 | struct task_struct *ksm_thread; | 1673 | struct task_struct *ksm_thread; |
1668 | int err; | 1674 | int err; |
1669 | 1675 | ||
1676 | ksm_init_max_kernel_pages(); | ||
1677 | |||
1670 | err = ksm_slab_init(); | 1678 | err = ksm_slab_init(); |
1671 | if (err) | 1679 | if (err) |
1672 | goto out; | 1680 | goto out; |
diff --git a/mm/madvise.c b/mm/madvise.c
index d9ae2067952e..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
218 | return error; | 218 | return error; |
219 | } | 219 | } |
220 | 220 | ||
221 | #ifdef CONFIG_MEMORY_FAILURE | ||
222 | /* | ||
223 | * Error injection support for memory error handling. | ||
224 | */ | ||
225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | ||
226 | { | ||
227 | int ret = 0; | ||
228 | |||
229 | if (!capable(CAP_SYS_ADMIN)) | ||
230 | return -EPERM; | ||
231 | for (; start < end; start += PAGE_SIZE) { | ||
232 | struct page *p; | ||
233 | int ret = get_user_pages(current, current->mm, start, 1, | ||
234 | 0, 0, &p, NULL); | ||
235 | if (ret != 1) | ||
236 | return ret; | ||
237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | ||
238 | page_to_pfn(p), start); | ||
239 | /* Ignore return value for now */ | ||
240 | __memory_failure(page_to_pfn(p), 0, 1); | ||
241 | put_page(p); | ||
242 | } | ||
243 | return ret; | ||
244 | } | ||
245 | #endif | ||
246 | |||
221 | static long | 247 | static long |
222 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | 248 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, |
223 | unsigned long start, unsigned long end, int behavior) | 249 | unsigned long start, unsigned long end, int behavior) |
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
308 | int write; | 334 | int write; |
309 | size_t len; | 335 | size_t len; |
310 | 336 | ||
337 | #ifdef CONFIG_MEMORY_FAILURE | ||
338 | if (behavior == MADV_HWPOISON) | ||
339 | return madvise_hwpoison(start, start+len_in); | ||
340 | #endif | ||
311 | if (!madvise_behavior_valid(behavior)) | 341 | if (!madvise_behavior_valid(behavior)) |
312 | return error; | 342 | return error; |
313 | 343 | ||
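A corresponding user-space sketch for the madvise-based injector above. It assumes MADV_HWPOISON is not yet exported by the libc headers, so the constant is defined locally with the value used in asm-generic/mman-common.h (assumed to be 100); the mapping and fill pattern are illustrative:

/* madvise_hwpoison_demo.c - poison one of our own anonymous pages (sketch) */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* assumed value; check the kernel headers */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, pagesize);	/* make sure the page is faulted in */

	/* Needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE=y, as in madvise_hwpoison() */
	if (madvise(p, pagesize, MADV_HWPOISON) != 0) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}
	printf("poisoned page at %p\n", (void *)p);
	return 0;
}

If the injection succeeds, a later access to the page is expected to raise SIGBUS.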
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
32 | #include <linux/rbtree.h> | ||
32 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
@@ -43,6 +44,7 @@ | |||
43 | 44 | ||
44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 45 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 46 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
47 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
46 | 48 | ||
47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 49 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 50 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
53 | #endif | 55 | #endif |
54 | 56 | ||
55 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ | 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | ||
56 | 59 | ||
57 | /* | 60 | /* |
58 | * Statistics for memory cgroup. | 61 | * Statistics for memory cgroup. |
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { | |||
66 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ |
67 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
68 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
69 | 74 | ||
70 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
71 | }; | 76 | }; |
@@ -78,6 +83,20 @@ struct mem_cgroup_stat { | |||
78 | struct mem_cgroup_stat_cpu cpustat[0]; | 83 | struct mem_cgroup_stat_cpu cpustat[0]; |
79 | }; | 84 | }; |
80 | 85 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
81 | /* | 100 | /* |
82 | * For accounting under irq disable, no need for increment preempt count. | 101 | * For accounting under irq disable, no need for increment preempt count. |
83 | */ | 102 | */ |
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { | |||
117 | unsigned long count[NR_LRU_LISTS]; | 136 | unsigned long count[NR_LRU_LISTS]; |
118 | 137 | ||
119 | struct zone_reclaim_stat reclaim_stat; | 138 | struct zone_reclaim_stat reclaim_stat; |
139 | struct rb_node tree_node; /* RB tree node */ | ||
140 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
141 | /* the soft limit is exceeded*/ | ||
142 | bool on_tree; | ||
143 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | ||
144 | /* use container_of */ | ||
120 | }; | 145 | }; |
121 | /* Macro for accessing counter */ | 146 | /* Macro for accessing counter */ |
122 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 147 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { | |||
130 | }; | 155 | }; |
131 | 156 | ||
132 | /* | 157 | /* |
158 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
159 | * their hierarchy representation | ||
160 | */ | ||
161 | |||
162 | struct mem_cgroup_tree_per_zone { | ||
163 | struct rb_root rb_root; | ||
164 | spinlock_t lock; | ||
165 | }; | ||
166 | |||
167 | struct mem_cgroup_tree_per_node { | ||
168 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
169 | }; | ||
170 | |||
171 | struct mem_cgroup_tree { | ||
172 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
173 | }; | ||
174 | |||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
176 | |||
177 | /* | ||
133 | * The memory controller data structure. The memory controller controls both | 178 | * The memory controller data structure. The memory controller controls both |
134 | * page cache and RSS per cgroup. We would eventually like to provide | 179 | * page cache and RSS per cgroup. We would eventually like to provide |
135 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 180 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
@@ -186,6 +231,13 @@ struct mem_cgroup { | |||
186 | struct mem_cgroup_stat stat; | 231 | struct mem_cgroup_stat stat; |
187 | }; | 232 | }; |
188 | 233 | ||
234 | /* | ||
235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | ||
236 | * limit reclaim to prevent infinite loops, if they ever occur. | ||
237 | */ | ||
238 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | ||
239 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | ||
240 | |||
189 | enum charge_type { | 241 | enum charge_type { |
190 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 242 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
191 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 243 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
@@ -200,13 +252,8 @@ enum charge_type { | |||
200 | #define PCGF_CACHE (1UL << PCG_CACHE) | 252 | #define PCGF_CACHE (1UL << PCG_CACHE) |
201 | #define PCGF_USED (1UL << PCG_USED) | 253 | #define PCGF_USED (1UL << PCG_USED) |
202 | #define PCGF_LOCK (1UL << PCG_LOCK) | 254 | #define PCGF_LOCK (1UL << PCG_LOCK) |
203 | static const unsigned long | 255 | /* Not used, but added here for completeness */ |
204 | pcg_default_flags[NR_CHARGE_TYPE] = { | 256 | #define PCGF_ACCT (1UL << PCG_ACCT) |
205 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
206 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
207 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
208 | 0, /* FORCE */ | ||
209 | }; | ||
210 | 257 | ||
211 | /* for encoding cft->private value on file */ | 258 | /* for encoding cft->private value on file */ |
212 | #define _MEM (0) | 259 | #define _MEM (0) |
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
215 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 262 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
216 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 263 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
217 | 264 | ||
265 | /* | ||
266 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
267 | */ | ||
268 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
269 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
270 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
271 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
272 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
273 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
274 | |||
218 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
219 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
220 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
221 | 278 | ||
279 | static struct mem_cgroup_per_zone * | ||
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
281 | { | ||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
283 | } | ||
284 | |||
285 | static struct mem_cgroup_per_zone * | ||
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
287 | { | ||
288 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
289 | int nid = page_cgroup_nid(pc); | ||
290 | int zid = page_cgroup_zid(pc); | ||
291 | |||
292 | if (!mem) | ||
293 | return NULL; | ||
294 | |||
295 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
296 | } | ||
297 | |||
298 | static struct mem_cgroup_tree_per_zone * | ||
299 | soft_limit_tree_node_zone(int nid, int zid) | ||
300 | { | ||
301 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
302 | } | ||
303 | |||
304 | static struct mem_cgroup_tree_per_zone * | ||
305 | soft_limit_tree_from_page(struct page *page) | ||
306 | { | ||
307 | int nid = page_to_nid(page); | ||
308 | int zid = page_zonenum(page); | ||
309 | |||
310 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
311 | } | ||
312 | |||
313 | static void | ||
314 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
315 | struct mem_cgroup_per_zone *mz, | ||
316 | struct mem_cgroup_tree_per_zone *mctz) | ||
317 | { | ||
318 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
319 | struct rb_node *parent = NULL; | ||
320 | struct mem_cgroup_per_zone *mz_node; | ||
321 | |||
322 | if (mz->on_tree) | ||
323 | return; | ||
324 | |||
325 | mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
326 | while (*p) { | ||
327 | parent = *p; | ||
328 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
329 | tree_node); | ||
330 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
331 | p = &(*p)->rb_left; | ||
332 | /* | ||
333 | * We can't avoid mem cgroups that are over their soft | ||
334 | * limit by the same amount | ||
335 | */ | ||
336 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
337 | p = &(*p)->rb_right; | ||
338 | } | ||
339 | rb_link_node(&mz->tree_node, parent, p); | ||
340 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
341 | mz->on_tree = true; | ||
342 | } | ||
343 | |||
344 | static void | ||
345 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
346 | struct mem_cgroup_per_zone *mz, | ||
347 | struct mem_cgroup_tree_per_zone *mctz) | ||
348 | { | ||
349 | if (!mz->on_tree) | ||
350 | return; | ||
351 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
352 | mz->on_tree = false; | ||
353 | } | ||
354 | |||
355 | static void | ||
356 | mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
357 | struct mem_cgroup_per_zone *mz, | ||
358 | struct mem_cgroup_tree_per_zone *mctz) | ||
359 | { | ||
360 | spin_lock(&mctz->lock); | ||
361 | __mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
362 | spin_unlock(&mctz->lock); | ||
363 | } | ||
364 | |||
365 | static void | ||
366 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
367 | struct mem_cgroup_per_zone *mz, | ||
368 | struct mem_cgroup_tree_per_zone *mctz) | ||
369 | { | ||
370 | spin_lock(&mctz->lock); | ||
371 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
372 | spin_unlock(&mctz->lock); | ||
373 | } | ||
374 | |||
375 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
376 | { | ||
377 | bool ret = false; | ||
378 | int cpu; | ||
379 | s64 val; | ||
380 | struct mem_cgroup_stat_cpu *cpustat; | ||
381 | |||
382 | cpu = get_cpu(); | ||
383 | cpustat = &mem->stat.cpustat[cpu]; | ||
384 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
385 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
386 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
387 | ret = true; | ||
388 | } | ||
389 | put_cpu(); | ||
390 | return ret; | ||
391 | } | ||
392 | |||
393 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | ||
394 | { | ||
395 | unsigned long long prev_usage_in_excess, new_usage_in_excess; | ||
396 | bool updated_tree = false; | ||
397 | struct mem_cgroup_per_zone *mz; | ||
398 | struct mem_cgroup_tree_per_zone *mctz; | ||
399 | |||
400 | mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); | ||
401 | mctz = soft_limit_tree_from_page(page); | ||
402 | |||
403 | /* | ||
404 | * We do updates in lazy mode; memcgs are removed | ||
405 | * lazily from the per-zone, per-node rb tree | ||
406 | */ | ||
407 | prev_usage_in_excess = mz->usage_in_excess; | ||
408 | |||
409 | new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
410 | if (prev_usage_in_excess) { | ||
411 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
412 | updated_tree = true; | ||
413 | } | ||
414 | if (!new_usage_in_excess) | ||
415 | goto done; | ||
416 | mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
417 | |||
418 | done: | ||
419 | if (updated_tree) { | ||
420 | spin_lock(&mctz->lock); | ||
421 | mz->usage_in_excess = new_usage_in_excess; | ||
422 | spin_unlock(&mctz->lock); | ||
423 | } | ||
424 | } | ||
425 | |||
426 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | ||
427 | { | ||
428 | int node, zone; | ||
429 | struct mem_cgroup_per_zone *mz; | ||
430 | struct mem_cgroup_tree_per_zone *mctz; | ||
431 | |||
432 | for_each_node_state(node, N_POSSIBLE) { | ||
433 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
434 | mz = mem_cgroup_zoneinfo(mem, node, zone); | ||
435 | mctz = soft_limit_tree_node_zone(node, zone); | ||
436 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
437 | } | ||
438 | } | ||
439 | } | ||
440 | |||
441 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
442 | { | ||
443 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
444 | } | ||
445 | |||
446 | static struct mem_cgroup_per_zone * | ||
447 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
448 | { | ||
449 | struct rb_node *rightmost = NULL; | ||
450 | struct mem_cgroup_per_zone *mz = NULL; | ||
451 | |||
452 | retry: | ||
453 | rightmost = rb_last(&mctz->rb_root); | ||
454 | if (!rightmost) | ||
455 | goto done; /* Nothing to reclaim from */ | ||
456 | |||
457 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
458 | /* | ||
459 | * Remove the node now but someone else can add it back, | ||
460 | * we will add it back at the end of reclaim to its correct | ||
461 | * position in the tree. | ||
462 | */ | ||
463 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
464 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | ||
465 | !css_tryget(&mz->mem->css)) | ||
466 | goto retry; | ||
467 | done: | ||
468 | return mz; | ||
469 | } | ||
470 | |||
471 | static struct mem_cgroup_per_zone * | ||
472 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
473 | { | ||
474 | struct mem_cgroup_per_zone *mz; | ||
475 | |||
476 | spin_lock(&mctz->lock); | ||
477 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
478 | spin_unlock(&mctz->lock); | ||
479 | return mz; | ||
480 | } | ||
481 | |||
482 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
483 | bool charge) | ||
484 | { | ||
485 | int val = (charge) ? 1 : -1; | ||
486 | struct mem_cgroup_stat *stat = &mem->stat; | ||
487 | struct mem_cgroup_stat_cpu *cpustat; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | cpustat = &stat->cpustat[cpu]; | ||
491 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
492 | put_cpu(); | ||
493 | } | ||
494 | |||
222 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 495 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
223 | struct page_cgroup *pc, | 496 | struct page_cgroup *pc, |
224 | bool charge) | 497 | bool charge) |
225 | { | 498 | { |
226 | int val = (charge)? 1 : -1; | 499 | int val = (charge) ? 1 : -1; |
227 | struct mem_cgroup_stat *stat = &mem->stat; | 500 | struct mem_cgroup_stat *stat = &mem->stat; |
228 | struct mem_cgroup_stat_cpu *cpustat; | 501 | struct mem_cgroup_stat_cpu *cpustat; |
229 | int cpu = get_cpu(); | 502 | int cpu = get_cpu(); |
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
240 | else | 513 | else |
241 | __mem_cgroup_stat_add_safe(cpustat, | 514 | __mem_cgroup_stat_add_safe(cpustat, |
242 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 515 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
516 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | ||
243 | put_cpu(); | 517 | put_cpu(); |
244 | } | 518 | } |
245 | 519 | ||
246 | static struct mem_cgroup_per_zone * | ||
247 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
248 | { | ||
249 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
250 | } | ||
251 | |||
252 | static struct mem_cgroup_per_zone * | ||
253 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
254 | { | ||
255 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
256 | int nid = page_cgroup_nid(pc); | ||
257 | int zid = page_cgroup_zid(pc); | ||
258 | |||
259 | if (!mem) | ||
260 | return NULL; | ||
261 | |||
262 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
263 | } | ||
264 | |||
265 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 520 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
266 | enum lru_list idx) | 521 | enum lru_list idx) |
267 | { | 522 | { |
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
354 | return ret; | 609 | return ret; |
355 | } | 610 | } |
356 | 611 | ||
612 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
613 | { | ||
614 | return (mem == root_mem_cgroup); | ||
615 | } | ||
616 | |||
357 | /* | 617 | /* |
358 | * Following LRU functions are allowed to be used without PCG_LOCK. | 618 | * Following LRU functions are allowed to be used without PCG_LOCK. |
359 | * Operations are called by routine of global LRU independently from memcg. | 619 | * Operations are called by routine of global LRU independently from memcg. |
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
371 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 631 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
372 | { | 632 | { |
373 | struct page_cgroup *pc; | 633 | struct page_cgroup *pc; |
374 | struct mem_cgroup *mem; | ||
375 | struct mem_cgroup_per_zone *mz; | 634 | struct mem_cgroup_per_zone *mz; |
376 | 635 | ||
377 | if (mem_cgroup_disabled()) | 636 | if (mem_cgroup_disabled()) |
378 | return; | 637 | return; |
379 | pc = lookup_page_cgroup(page); | 638 | pc = lookup_page_cgroup(page); |
380 | /* can happen while we handle swapcache. */ | 639 | /* can happen while we handle swapcache. */ |
381 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 640 | if (!TestClearPageCgroupAcctLRU(pc)) |
382 | return; | 641 | return; |
642 | VM_BUG_ON(!pc->mem_cgroup); | ||
383 | /* | 643 | /* |
384 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 644 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
385 | * removed from global LRU. | 645 | * removed from global LRU. |
386 | */ | 646 | */ |
387 | mz = page_cgroup_zoneinfo(pc); | 647 | mz = page_cgroup_zoneinfo(pc); |
388 | mem = pc->mem_cgroup; | ||
389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 648 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
649 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
650 | return; | ||
651 | VM_BUG_ON(list_empty(&pc->lru)); | ||
390 | list_del_init(&pc->lru); | 652 | list_del_init(&pc->lru); |
391 | return; | 653 | return; |
392 | } | 654 | } |
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
410 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 672 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
411 | */ | 673 | */ |
412 | smp_rmb(); | 674 | smp_rmb(); |
413 | /* unused page is not rotated. */ | 675 | /* unused or root page is not rotated. */ |
414 | if (!PageCgroupUsed(pc)) | 676 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
415 | return; | 677 | return; |
416 | mz = page_cgroup_zoneinfo(pc); | 678 | mz = page_cgroup_zoneinfo(pc); |
417 | list_move(&pc->lru, &mz->lists[lru]); | 679 | list_move(&pc->lru, &mz->lists[lru]); |
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
425 | if (mem_cgroup_disabled()) | 687 | if (mem_cgroup_disabled()) |
426 | return; | 688 | return; |
427 | pc = lookup_page_cgroup(page); | 689 | pc = lookup_page_cgroup(page); |
690 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
428 | /* | 691 | /* |
429 | * Used bit is set without atomic ops but after smp_wmb(). | 692 | * Used bit is set without atomic ops but after smp_wmb(). |
430 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 693 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
435 | 698 | ||
436 | mz = page_cgroup_zoneinfo(pc); | 699 | mz = page_cgroup_zoneinfo(pc); |
437 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 700 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
701 | SetPageCgroupAcctLRU(pc); | ||
702 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
703 | return; | ||
438 | list_add(&pc->lru, &mz->lists[lru]); | 704 | list_add(&pc->lru, &mz->lists[lru]); |
439 | } | 705 | } |
440 | 706 | ||
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
469 | 735 | ||
470 | spin_lock_irqsave(&zone->lru_lock, flags); | 736 | spin_lock_irqsave(&zone->lru_lock, flags); |
471 | /* link when the page is linked to LRU but page_cgroup isn't */ | 737 | /* link when the page is linked to LRU but page_cgroup isn't */ |
472 | if (PageLRU(page) && list_empty(&pc->lru)) | 738 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
473 | mem_cgroup_add_lru_list(page, page_lru(page)); | 739 | mem_cgroup_add_lru_list(page, page_lru(page)); |
474 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 740 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
475 | } | 741 | } |
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
855 | * If shrink==true, this returns immediately to avoid freeing too much. | 1121 | * If shrink==true, this returns immediately to avoid freeing too much. |
856 | */ | 1122 | */ |
857 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1123 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
858 | gfp_t gfp_mask, bool noswap, bool shrink) | 1124 | struct zone *zone, |
1125 | gfp_t gfp_mask, | ||
1126 | unsigned long reclaim_options) | ||
859 | { | 1127 | { |
860 | struct mem_cgroup *victim; | 1128 | struct mem_cgroup *victim; |
861 | int ret, total = 0; | 1129 | int ret, total = 0; |
862 | int loop = 0; | 1130 | int loop = 0; |
1131 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
1132 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
1133 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
1134 | unsigned long excess = mem_cgroup_get_excess(root_mem); | ||
863 | 1135 | ||
864 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1136 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
865 | if (root_mem->memsw_is_minimum) | 1137 | if (root_mem->memsw_is_minimum) |
866 | noswap = true; | 1138 | noswap = true; |
867 | 1139 | ||
868 | while (loop < 2) { | 1140 | while (1) { |
869 | victim = mem_cgroup_select_victim(root_mem); | 1141 | victim = mem_cgroup_select_victim(root_mem); |
870 | if (victim == root_mem) | 1142 | if (victim == root_mem) { |
871 | loop++; | 1143 | loop++; |
1144 | if (loop >= 2) { | ||
1145 | /* | ||
1146 | * If we have not been able to reclaim | ||
1147 | * anything, it might because there are | ||
1148 | * no reclaimable pages under this hierarchy | ||
1149 | */ | ||
1150 | if (!check_soft || !total) { | ||
1151 | css_put(&victim->css); | ||
1152 | break; | ||
1153 | } | ||
1154 | /* | ||
1155 | * We want to do more targeted reclaim. | ||
1156 | * excess >> 2 is neither so large that we | ||
1157 | * reclaim too much, nor so small that we keep | ||
1158 | * coming back to reclaim from this cgroup | ||
1159 | */ | ||
1160 | if (total >= (excess >> 2) || | ||
1161 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | ||
1162 | css_put(&victim->css); | ||
1163 | break; | ||
1164 | } | ||
1165 | } | ||
1166 | } | ||
872 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1167 | if (!mem_cgroup_local_usage(&victim->stat)) { |
873 | /* this cgroup's local usage == 0 */ | 1168 | /* this cgroup's local usage == 0 */ |
874 | css_put(&victim->css); | 1169 | css_put(&victim->css); |
875 | continue; | 1170 | continue; |
876 | } | 1171 | } |
877 | /* we use swappiness of local cgroup */ | 1172 | /* we use swappiness of local cgroup */ |
878 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, | 1173 | if (check_soft) |
879 | get_swappiness(victim)); | 1174 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1175 | noswap, get_swappiness(victim), zone, | ||
1176 | zone->zone_pgdat->node_id); | ||
1177 | else | ||
1178 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | ||
1179 | noswap, get_swappiness(victim)); | ||
880 | css_put(&victim->css); | 1180 | css_put(&victim->css); |
881 | /* | 1181 | /* |
882 | * At shrinking usage, we can't check we should stop here or | 1182 | * At shrinking usage, we can't check we should stop here or |
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
886 | if (shrink) | 1186 | if (shrink) |
887 | return ret; | 1187 | return ret; |
888 | total += ret; | 1188 | total += ret; |
889 | if (mem_cgroup_check_under_limit(root_mem)) | 1189 | if (check_soft) { |
1190 | if (res_counter_check_under_soft_limit(&root_mem->res)) | ||
1191 | return total; | ||
1192 | } else if (mem_cgroup_check_under_limit(root_mem)) | ||
890 | return 1 + total; | 1193 | return 1 + total; |
891 | } | 1194 | } |
892 | return total; | 1195 | return total; |
@@ -965,11 +1268,11 @@ done: | |||
965 | */ | 1268 | */ |
966 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1269 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
967 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1270 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
968 | bool oom) | 1271 | bool oom, struct page *page) |
969 | { | 1272 | { |
970 | struct mem_cgroup *mem, *mem_over_limit; | 1273 | struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; |
971 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1274 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
972 | struct res_counter *fail_res; | 1275 | struct res_counter *fail_res, *soft_fail_res = NULL; |
973 | 1276 | ||
974 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1277 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
975 | /* Don't account this! */ | 1278 | /* Don't account this! */ |
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
996 | VM_BUG_ON(css_is_removed(&mem->css)); | 1299 | VM_BUG_ON(css_is_removed(&mem->css)); |
997 | 1300 | ||
998 | while (1) { | 1301 | while (1) { |
999 | int ret; | 1302 | int ret = 0; |
1000 | bool noswap = false; | 1303 | unsigned long flags = 0; |
1001 | 1304 | ||
1002 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1305 | if (mem_cgroup_is_root(mem)) |
1306 | goto done; | ||
1307 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, | ||
1308 | &soft_fail_res); | ||
1003 | if (likely(!ret)) { | 1309 | if (likely(!ret)) { |
1004 | if (!do_swap_account) | 1310 | if (!do_swap_account) |
1005 | break; | 1311 | break; |
1006 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1312 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, |
1007 | &fail_res); | 1313 | &fail_res, NULL); |
1008 | if (likely(!ret)) | 1314 | if (likely(!ret)) |
1009 | break; | 1315 | break; |
1010 | /* mem+swap counter fails */ | 1316 | /* mem+swap counter fails */ |
1011 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1317 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1012 | noswap = true; | 1318 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1013 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1319 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1014 | memsw); | 1320 | memsw); |
1015 | } else | 1321 | } else |
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1020 | if (!(gfp_mask & __GFP_WAIT)) | 1326 | if (!(gfp_mask & __GFP_WAIT)) |
1021 | goto nomem; | 1327 | goto nomem; |
1022 | 1328 | ||
1023 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, | 1329 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1024 | noswap, false); | 1330 | gfp_mask, flags); |
1025 | if (ret) | 1331 | if (ret) |
1026 | continue; | 1332 | continue; |
1027 | 1333 | ||
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1046 | goto nomem; | 1352 | goto nomem; |
1047 | } | 1353 | } |
1048 | } | 1354 | } |
1355 | /* | ||
1356 | * Insert just the ancestor; we should trickle down to the correct | ||
1357 | * cgroup for reclaim, since the other nodes will be below their | ||
1358 | * soft limit | ||
1359 | */ | ||
1360 | if (soft_fail_res) { | ||
1361 | mem_over_soft_limit = | ||
1362 | mem_cgroup_from_res_counter(soft_fail_res, res); | ||
1363 | if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) | ||
1364 | mem_cgroup_update_tree(mem_over_soft_limit, page); | ||
1365 | } | ||
1366 | done: | ||
1049 | return 0; | 1367 | return 0; |
1050 | nomem: | 1368 | nomem: |
1051 | css_put(&mem->css); | 1369 | css_put(&mem->css); |
1052 | return -ENOMEM; | 1370 | return -ENOMEM; |
1053 | } | 1371 | } |
1054 | 1372 | ||
1055 | |||
1056 | /* | 1373 | /* |
1057 | * A helper function to get mem_cgroup from ID. must be called under | 1374 | * A helper function to get mem_cgroup from ID. must be called under |
1058 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1375 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1119 | lock_page_cgroup(pc); | 1436 | lock_page_cgroup(pc); |
1120 | if (unlikely(PageCgroupUsed(pc))) { | 1437 | if (unlikely(PageCgroupUsed(pc))) { |
1121 | unlock_page_cgroup(pc); | 1438 | unlock_page_cgroup(pc); |
1122 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1439 | if (!mem_cgroup_is_root(mem)) { |
1123 | if (do_swap_account) | 1440 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1124 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1441 | if (do_swap_account) |
1442 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, | ||
1443 | NULL); | ||
1444 | } | ||
1125 | css_put(&mem->css); | 1445 | css_put(&mem->css); |
1126 | return; | 1446 | return; |
1127 | } | 1447 | } |
1448 | |||
1128 | pc->mem_cgroup = mem; | 1449 | pc->mem_cgroup = mem; |
1450 | /* | ||
1451 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
1452 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | ||
1453 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | ||
1454 | * before USED bit, we need memory barrier here. | ||
1455 | * See mem_cgroup_add_lru_list(), etc. | ||
1456 | */ | ||
1129 | smp_wmb(); | 1457 | smp_wmb(); |
1130 | pc->flags = pcg_default_flags[ctype]; | 1458 | switch (ctype) { |
1459 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
1460 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
1461 | SetPageCgroupCache(pc); | ||
1462 | SetPageCgroupUsed(pc); | ||
1463 | break; | ||
1464 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1465 | ClearPageCgroupCache(pc); | ||
1466 | SetPageCgroupUsed(pc); | ||
1467 | break; | ||
1468 | default: | ||
1469 | break; | ||
1470 | } | ||
1131 | 1471 | ||
1132 | mem_cgroup_charge_statistics(mem, pc, true); | 1472 | mem_cgroup_charge_statistics(mem, pc, true); |
1133 | 1473 | ||
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1178 | if (pc->mem_cgroup != from) | 1518 | if (pc->mem_cgroup != from) |
1179 | goto out; | 1519 | goto out; |
1180 | 1520 | ||
1181 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1521 | if (!mem_cgroup_is_root(from)) |
1522 | res_counter_uncharge(&from->res, PAGE_SIZE, NULL); | ||
1182 | mem_cgroup_charge_statistics(from, pc, false); | 1523 | mem_cgroup_charge_statistics(from, pc, false); |
1183 | 1524 | ||
1184 | page = pc->page; | 1525 | page = pc->page; |
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1197 | 1); | 1538 | 1); |
1198 | } | 1539 | } |
1199 | 1540 | ||
1200 | if (do_swap_account) | 1541 | if (do_swap_account && !mem_cgroup_is_root(from)) |
1201 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1542 | res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); |
1202 | css_put(&from->css); | 1543 | css_put(&from->css); |
1203 | 1544 | ||
1204 | css_get(&to->css); | 1545 | css_get(&to->css); |
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1238 | parent = mem_cgroup_from_cont(pcg); | 1579 | parent = mem_cgroup_from_cont(pcg); |
1239 | 1580 | ||
1240 | 1581 | ||
1241 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 1582 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1242 | if (ret || !parent) | 1583 | if (ret || !parent) |
1243 | return ret; | 1584 | return ret; |
1244 | 1585 | ||
@@ -1268,9 +1609,11 @@ uncharge: | |||
1268 | /* drop extra refcnt by try_charge() */ | 1609 | /* drop extra refcnt by try_charge() */ |
1269 | css_put(&parent->css); | 1610 | css_put(&parent->css); |
1270 | /* uncharge if move fails */ | 1611 | /* uncharge if move fails */ |
1271 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 1612 | if (!mem_cgroup_is_root(parent)) { |
1272 | if (do_swap_account) | 1613 | res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); |
1273 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | 1614 | if (do_swap_account) |
1615 | res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); | ||
1616 | } | ||
1274 | return ret; | 1617 | return ret; |
1275 | } | 1618 | } |
1276 | 1619 | ||
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1295 | prefetchw(pc); | 1638 | prefetchw(pc); |
1296 | 1639 | ||
1297 | mem = memcg; | 1640 | mem = memcg; |
1298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 1641 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); |
1299 | if (ret || !mem) | 1642 | if (ret || !mem) |
1300 | return ret; | 1643 | return ret; |
1301 | 1644 | ||
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1414 | if (!mem) | 1757 | if (!mem) |
1415 | goto charge_cur_mm; | 1758 | goto charge_cur_mm; |
1416 | *ptr = mem; | 1759 | *ptr = mem; |
1417 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1760 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); |
1418 | /* drop extra refcnt from tryget */ | 1761 | /* drop extra refcnt from tryget */ |
1419 | css_put(&mem->css); | 1762 | css_put(&mem->css); |
1420 | return ret; | 1763 | return ret; |
1421 | charge_cur_mm: | 1764 | charge_cur_mm: |
1422 | if (unlikely(!mm)) | 1765 | if (unlikely(!mm)) |
1423 | mm = &init_mm; | 1766 | mm = &init_mm; |
1424 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 1767 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); |
1425 | } | 1768 | } |
1426 | 1769 | ||
1427 | static void | 1770 | static void |
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1459 | * This recorded memcg can be obsolete one. So, avoid | 1802 | * This recorded memcg can be obsolete one. So, avoid |
1460 | * calling css_tryget | 1803 | * calling css_tryget |
1461 | */ | 1804 | */ |
1462 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1805 | if (!mem_cgroup_is_root(memcg)) |
1806 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, | ||
1807 | NULL); | ||
1808 | mem_cgroup_swap_statistics(memcg, false); | ||
1463 | mem_cgroup_put(memcg); | 1809 | mem_cgroup_put(memcg); |
1464 | } | 1810 | } |
1465 | rcu_read_unlock(); | 1811 | rcu_read_unlock(); |
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1484 | return; | 1830 | return; |
1485 | if (!mem) | 1831 | if (!mem) |
1486 | return; | 1832 | return; |
1487 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1833 | if (!mem_cgroup_is_root(mem)) { |
1488 | if (do_swap_account) | 1834 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1489 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1835 | if (do_swap_account) |
1836 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1837 | } | ||
1490 | css_put(&mem->css); | 1838 | css_put(&mem->css); |
1491 | } | 1839 | } |
1492 | 1840 | ||
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1500 | struct page_cgroup *pc; | 1848 | struct page_cgroup *pc; |
1501 | struct mem_cgroup *mem = NULL; | 1849 | struct mem_cgroup *mem = NULL; |
1502 | struct mem_cgroup_per_zone *mz; | 1850 | struct mem_cgroup_per_zone *mz; |
1851 | bool soft_limit_excess = false; | ||
1503 | 1852 | ||
1504 | if (mem_cgroup_disabled()) | 1853 | if (mem_cgroup_disabled()) |
1505 | return NULL; | 1854 | return NULL; |
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1538 | break; | 1887 | break; |
1539 | } | 1888 | } |
1540 | 1889 | ||
1541 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1890 | if (!mem_cgroup_is_root(mem)) { |
1542 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1891 | res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); |
1543 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1892 | if (do_swap_account && |
1893 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1894 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1895 | } | ||
1896 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1897 | mem_cgroup_swap_statistics(mem, true); | ||
1544 | mem_cgroup_charge_statistics(mem, pc, false); | 1898 | mem_cgroup_charge_statistics(mem, pc, false); |
1545 | 1899 | ||
1546 | ClearPageCgroupUsed(pc); | 1900 | ClearPageCgroupUsed(pc); |
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1554 | mz = page_cgroup_zoneinfo(pc); | 1908 | mz = page_cgroup_zoneinfo(pc); |
1555 | unlock_page_cgroup(pc); | 1909 | unlock_page_cgroup(pc); |
1556 | 1910 | ||
1911 | if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) | ||
1912 | mem_cgroup_update_tree(mem, page); | ||
1557 | /* at swapout, this memcg will be accessed to record to swap */ | 1913 | /* at swapout, this memcg will be accessed to record to swap */ |
1558 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1914 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1559 | css_put(&mem->css); | 1915 | css_put(&mem->css); |
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1629 | * We uncharge this because swap is freed. | 1985 | * We uncharge this because swap is freed. |
1630 | * This memcg can be obsolete one. We avoid calling css_tryget | 1986 | * This memcg can be obsolete one. We avoid calling css_tryget |
1631 | */ | 1987 | */ |
1632 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1988 | if (!mem_cgroup_is_root(memcg)) |
1989 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | ||
1990 | mem_cgroup_swap_statistics(memcg, false); | ||
1633 | mem_cgroup_put(memcg); | 1991 | mem_cgroup_put(memcg); |
1634 | } | 1992 | } |
1635 | rcu_read_unlock(); | 1993 | rcu_read_unlock(); |
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
1658 | unlock_page_cgroup(pc); | 2016 | unlock_page_cgroup(pc); |
1659 | 2017 | ||
1660 | if (mem) { | 2018 | if (mem) { |
1661 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2019 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
2020 | page); | ||
1662 | css_put(&mem->css); | 2021 | css_put(&mem->css); |
1663 | } | 2022 | } |
1664 | *ptr = mem; | 2023 | *ptr = mem; |
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
1798 | if (!ret) | 2157 | if (!ret) |
1799 | break; | 2158 | break; |
1800 | 2159 | ||
1801 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | 2160 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, |
1802 | false, true); | 2161 | GFP_KERNEL, |
2162 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1803 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2163 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
1804 | /* Usage is reduced ? */ | 2164 | /* Usage is reduced ? */ |
1805 | if (curusage >= oldusage) | 2165 | if (curusage >= oldusage) |
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1851 | if (!ret) | 2211 | if (!ret) |
1852 | break; | 2212 | break; |
1853 | 2213 | ||
1854 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); | 2214 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2215 | MEM_CGROUP_RECLAIM_NOSWAP | | ||
2216 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1855 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2217 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
1856 | /* Usage is reduced ? */ | 2218 | /* Usage is reduced ? */ |
1857 | if (curusage >= oldusage) | 2219 | if (curusage >= oldusage) |
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1862 | return ret; | 2224 | return ret; |
1863 | } | 2225 | } |
1864 | 2226 | ||
2227 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
2228 | gfp_t gfp_mask, int nid, | ||
2229 | int zid) | ||
2230 | { | ||
2231 | unsigned long nr_reclaimed = 0; | ||
2232 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
2233 | unsigned long reclaimed; | ||
2234 | int loop = 0; | ||
2235 | struct mem_cgroup_tree_per_zone *mctz; | ||
2236 | |||
2237 | if (order > 0) | ||
2238 | return 0; | ||
2239 | |||
2240 | mctz = soft_limit_tree_node_zone(nid, zid); | ||
2241 | /* | ||
2242 | * This loop can run for a while, especially if mem_cgroups continuously | ||
2243 | * keep exceeding their soft limit and putting the system under | ||
2244 | * pressure | ||
2245 | */ | ||
2246 | do { | ||
2247 | if (next_mz) | ||
2248 | mz = next_mz; | ||
2249 | else | ||
2250 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
2251 | if (!mz) | ||
2252 | break; | ||
2253 | |||
2254 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | ||
2255 | gfp_mask, | ||
2256 | MEM_CGROUP_RECLAIM_SOFT); | ||
2257 | nr_reclaimed += reclaimed; | ||
2258 | spin_lock(&mctz->lock); | ||
2259 | |||
2260 | /* | ||
2261 | * If we failed to reclaim anything from this memory cgroup | ||
2262 | * it is time to move on to the next cgroup | ||
2263 | */ | ||
2264 | next_mz = NULL; | ||
2265 | if (!reclaimed) { | ||
2266 | do { | ||
2267 | /* | ||
2268 | * Loop until we find yet another one. | ||
2269 | * | ||
2270 | * By the time we get the soft_limit lock | ||
2271 | * again, someone might have added the | ||
2272 | * group back on the RB tree. Iterate to | ||
2273 | * make sure we get a different mem. | ||
2274 | * mem_cgroup_largest_soft_limit_node returns | ||
2275 | * NULL if no other cgroup is present on | ||
2276 | * the tree | ||
2277 | */ | ||
2278 | next_mz = | ||
2279 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
2280 | if (next_mz == mz) { | ||
2281 | css_put(&next_mz->mem->css); | ||
2282 | next_mz = NULL; | ||
2283 | } else /* next_mz == NULL or other memcg */ | ||
2284 | break; | ||
2285 | } while (1); | ||
2286 | } | ||
2287 | mz->usage_in_excess = | ||
2288 | res_counter_soft_limit_excess(&mz->mem->res); | ||
2289 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
2290 | /* | ||
2291 | * One school of thought says that we should not add | ||
2292 | * back the node to the tree if reclaim returns 0. | ||
2293 | * But our reclaim could return 0 simply because, due | ||
2294 | * to priority, we are exposing a smaller subset of | ||
2295 | * memory to reclaim from. Consider this as a longer | ||
2296 | * term TODO. | ||
2297 | */ | ||
2298 | if (mz->usage_in_excess) | ||
2299 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); | ||
2300 | spin_unlock(&mctz->lock); | ||
2301 | css_put(&mz->mem->css); | ||
2302 | loop++; | ||
2303 | /* | ||
2304 | * Could not reclaim anything and there are no more | ||
2305 | * mem cgroups to try or we seem to be looping without | ||
2306 | * reclaiming anything. | ||
2307 | */ | ||
2308 | if (!nr_reclaimed && | ||
2309 | (next_mz == NULL || | ||
2310 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
2311 | break; | ||
2312 | } while (!nr_reclaimed); | ||
2313 | if (next_mz) | ||
2314 | css_put(&next_mz->mem->css); | ||
2315 | return nr_reclaimed; | ||
2316 | } | ||
2317 | |||
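mem_cgroup_soft_limit_reclaim() above always attacks the worst offender first: it pulls the memcg with the largest soft-limit excess off the zone's RB tree, runs hierarchical reclaim against it, and re-inserts it only while it still exceeds its soft limit. The user-space sketch below only illustrates that selection policy; it replaces the kernel's RB tree with a flat array and uses invented names, so it is an illustration of the ordering, not part of the patch.

    #include <stdio.h>

    /* Toy stand-in for one mem_cgroup_per_zone entry on the soft-limit tree. */
    struct excess_node {
        const char *name;
        long usage;       /* current usage, in pages */
        long soft_limit;  /* soft limit, in pages */
    };

    static long excess(const struct excess_node *n)
    {
        return n->usage > n->soft_limit ? n->usage - n->soft_limit : 0;
    }

    /* Pick the group with the largest excess, in the spirit of
     * __mem_cgroup_largest_soft_limit_node(), but over a flat array. */
    static struct excess_node *largest_excess(struct excess_node *v, int n)
    {
        struct excess_node *best = NULL;
        int i;

        for (i = 0; i < n; i++)
            if (excess(&v[i]) > 0 && (!best || excess(&v[i]) > excess(best)))
                best = &v[i];
        return best;
    }

    int main(void)
    {
        struct excess_node groups[] = {
            { "A", 900, 512 }, { "B", 700, 640 }, { "C", 300, 512 },
        };
        struct excess_node *mz;

        /* Same ordering as the loop above: worst offender first, re-evaluated
         * after every round of (simulated) reclaim. */
        while ((mz = largest_excess(groups, 3)) != NULL) {
            printf("reclaim from %s (excess %ld pages)\n", mz->name, excess(mz));
            mz->usage -= 200;   /* pretend hierarchical reclaim freed 200 pages */
        }
        return 0;
    }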
1865 | /* | 2318 | /* |
1866 | * This routine traverse page_cgroup in given list and drop them all. | 2319 | * This routine traverse page_cgroup in given list and drop them all. |
1867 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 2320 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2046 | return retval; | 2499 | return retval; |
2047 | } | 2500 | } |
2048 | 2501 | ||
2502 | struct mem_cgroup_idx_data { | ||
2503 | s64 val; | ||
2504 | enum mem_cgroup_stat_index idx; | ||
2505 | }; | ||
2506 | |||
2507 | static int | ||
2508 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
2509 | { | ||
2510 | struct mem_cgroup_idx_data *d = data; | ||
2511 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
2512 | return 0; | ||
2513 | } | ||
2514 | |||
2515 | static void | ||
2516 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
2517 | enum mem_cgroup_stat_index idx, s64 *val) | ||
2518 | { | ||
2519 | struct mem_cgroup_idx_data d; | ||
2520 | d.idx = idx; | ||
2521 | d.val = 0; | ||
2522 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
2523 | *val = d.val; | ||
2524 | } | ||
2525 | |||
2049 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2526 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2050 | { | 2527 | { |
2051 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2528 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2052 | u64 val = 0; | 2529 | u64 idx_val, val; |
2053 | int type, name; | 2530 | int type, name; |
2054 | 2531 | ||
2055 | type = MEMFILE_TYPE(cft->private); | 2532 | type = MEMFILE_TYPE(cft->private); |
2056 | name = MEMFILE_ATTR(cft->private); | 2533 | name = MEMFILE_ATTR(cft->private); |
2057 | switch (type) { | 2534 | switch (type) { |
2058 | case _MEM: | 2535 | case _MEM: |
2059 | val = res_counter_read_u64(&mem->res, name); | 2536 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2537 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2538 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2539 | val = idx_val; | ||
2540 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2541 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2542 | val += idx_val; | ||
2543 | val <<= PAGE_SHIFT; | ||
2544 | } else | ||
2545 | val = res_counter_read_u64(&mem->res, name); | ||
2060 | break; | 2546 | break; |
2061 | case _MEMSWAP: | 2547 | case _MEMSWAP: |
2062 | val = res_counter_read_u64(&mem->memsw, name); | 2548 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2549 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2550 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2551 | val = idx_val; | ||
2552 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2553 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2554 | val += idx_val; | ||
2555 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2556 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2557 | val = (val + idx_val) << PAGE_SHIFT; | ||
2558 | } else | ||
2559 | val = res_counter_read_u64(&mem->memsw, name); | ||
2063 | break; | 2560 | break; |
2064 | default: | 2561 | default: |
2065 | BUG(); | 2562 | BUG(); |
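For the root cgroup this hunk stops trusting the res_counter for RES_USAGE (root-level charging is bypassed elsewhere in this series, as the uncharge paths above show) and instead reconstructs usage from the hierarchical CACHE/RSS statistics, plus SWAPOUT for memsw, converting pages to bytes with PAGE_SHIFT. A minimal user-space sketch of that arithmetic follows, with invented names and 4 KiB pages assumed; it is not kernel code.

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumes 4 KiB pages */

    /* Hierarchy-wide totals, as gathered by something like mem_cgroup_walk_tree(). */
    struct hier_stats {
        long long cache_pages;
        long long rss_pages;
        long long swap_pages;
    };

    /* memory.usage_in_bytes for root: (CACHE + RSS) pages, converted to bytes. */
    static long long root_mem_usage(const struct hier_stats *s)
    {
        return (s->cache_pages + s->rss_pages) << PAGE_SHIFT;
    }

    /* memory.memsw.usage_in_bytes for root: CACHE + RSS + SWAPOUT. */
    static long long root_memsw_usage(const struct hier_stats *s)
    {
        return (s->cache_pages + s->rss_pages + s->swap_pages) << PAGE_SHIFT;
    }

    int main(void)
    {
        struct hier_stats s = { .cache_pages = 1000, .rss_pages = 500, .swap_pages = 100 };

        printf("usage_in_bytes       = %lld\n", root_mem_usage(&s));
        printf("memsw.usage_in_bytes = %lld\n", root_memsw_usage(&s));
        return 0;
    }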
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2083 | name = MEMFILE_ATTR(cft->private); | 2580 | name = MEMFILE_ATTR(cft->private); |
2084 | switch (name) { | 2581 | switch (name) { |
2085 | case RES_LIMIT: | 2582 | case RES_LIMIT: |
2583 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
2584 | ret = -EINVAL; | ||
2585 | break; | ||
2586 | } | ||
2086 | /* This function does all necessary parse...reuse it */ | 2587 | /* This function does all necessary parse...reuse it */ |
2087 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2588 | ret = res_counter_memparse_write_strategy(buffer, &val); |
2088 | if (ret) | 2589 | if (ret) |
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2092 | else | 2593 | else |
2093 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 2594 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
2094 | break; | 2595 | break; |
2596 | case RES_SOFT_LIMIT: | ||
2597 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
2598 | if (ret) | ||
2599 | break; | ||
2600 | /* | ||
2601 | * For memsw, soft limits are hard to implement in terms | ||
2602 | * of semantics; for now, we support soft limits for | ||
2603 | * memory control, not for memsw. | ||
2604 | */ | ||
2605 | if (type == _MEM) | ||
2606 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
2607 | else | ||
2608 | ret = -EINVAL; | ||
2609 | break; | ||
2095 | default: | 2610 | default: |
2096 | ret = -EINVAL; /* should be BUG() ? */ | 2611 | ret = -EINVAL; /* should be BUG() ? */ |
2097 | break; | 2612 | break; |
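The new RES_SOFT_LIMIT case accepts the same memparse-style strings as the hard limit ("256M", "1G", ...) but only for the memory counter, not for memsw. As a usage illustration only, the sketch below sets a soft limit from user space; the mount point and group name are assumptions and depend on how the memory cgroup hierarchy is mounted on the running system.

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* Both the mount point and the group name are assumptions; adjust to
         * wherever the memory cgroup hierarchy is mounted on your system. */
        const char *path = "/cgroups/memory/web/memory.soft_limit_in_bytes";
        const char *val = "256M\n";   /* memparse-style suffixes (K/M/G) are accepted */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
            perror("write");
        close(fd);
        return 0;
    }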
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2149 | res_counter_reset_failcnt(&mem->memsw); | 2664 | res_counter_reset_failcnt(&mem->memsw); |
2150 | break; | 2665 | break; |
2151 | } | 2666 | } |
2667 | |||
2152 | return 0; | 2668 | return 0; |
2153 | } | 2669 | } |
2154 | 2670 | ||
@@ -2160,6 +2676,7 @@ enum { | |||
2160 | MCS_MAPPED_FILE, | 2676 | MCS_MAPPED_FILE, |
2161 | MCS_PGPGIN, | 2677 | MCS_PGPGIN, |
2162 | MCS_PGPGOUT, | 2678 | MCS_PGPGOUT, |
2679 | MCS_SWAP, | ||
2163 | MCS_INACTIVE_ANON, | 2680 | MCS_INACTIVE_ANON, |
2164 | MCS_ACTIVE_ANON, | 2681 | MCS_ACTIVE_ANON, |
2165 | MCS_INACTIVE_FILE, | 2682 | MCS_INACTIVE_FILE, |
@@ -2181,6 +2698,7 @@ struct { | |||
2181 | {"mapped_file", "total_mapped_file"}, | 2698 | {"mapped_file", "total_mapped_file"}, |
2182 | {"pgpgin", "total_pgpgin"}, | 2699 | {"pgpgin", "total_pgpgin"}, |
2183 | {"pgpgout", "total_pgpgout"}, | 2700 | {"pgpgout", "total_pgpgout"}, |
2701 | {"swap", "total_swap"}, | ||
2184 | {"inactive_anon", "total_inactive_anon"}, | 2702 | {"inactive_anon", "total_inactive_anon"}, |
2185 | {"active_anon", "total_active_anon"}, | 2703 | {"active_anon", "total_active_anon"}, |
2186 | {"inactive_file", "total_inactive_file"}, | 2704 | {"inactive_file", "total_inactive_file"}, |
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2205 | s->stat[MCS_PGPGIN] += val; | 2723 | s->stat[MCS_PGPGIN] += val; |
2206 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2724 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2207 | s->stat[MCS_PGPGOUT] += val; | 2725 | s->stat[MCS_PGPGOUT] += val; |
2726 | if (do_swap_account) { | ||
2727 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | ||
2728 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
2729 | } | ||
2208 | 2730 | ||
2209 | /* per zone stat */ | 2731 | /* per zone stat */ |
2210 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 2732 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2236 | memset(&mystat, 0, sizeof(mystat)); | 2758 | memset(&mystat, 0, sizeof(mystat)); |
2237 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 2759 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
2238 | 2760 | ||
2239 | for (i = 0; i < NR_MCS_STAT; i++) | 2761 | for (i = 0; i < NR_MCS_STAT; i++) { |
2762 | if (i == MCS_SWAP && !do_swap_account) | ||
2763 | continue; | ||
2240 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 2764 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); |
2765 | } | ||
2241 | 2766 | ||
2242 | /* Hierarchical information */ | 2767 | /* Hierarchical information */ |
2243 | { | 2768 | { |
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2250 | 2775 | ||
2251 | memset(&mystat, 0, sizeof(mystat)); | 2776 | memset(&mystat, 0, sizeof(mystat)); |
2252 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 2777 | mem_cgroup_get_total_stat(mem_cont, &mystat); |
2253 | for (i = 0; i < NR_MCS_STAT; i++) | 2778 | for (i = 0; i < NR_MCS_STAT; i++) { |
2779 | if (i == MCS_SWAP && !do_swap_account) | ||
2780 | continue; | ||
2254 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 2781 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); |
2255 | 2782 | } | |
2256 | 2783 | ||
2257 | #ifdef CONFIG_DEBUG_VM | 2784 | #ifdef CONFIG_DEBUG_VM |
2258 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 2785 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = { | |||
2345 | .read_u64 = mem_cgroup_read, | 2872 | .read_u64 = mem_cgroup_read, |
2346 | }, | 2873 | }, |
2347 | { | 2874 | { |
2875 | .name = "soft_limit_in_bytes", | ||
2876 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | ||
2877 | .write_string = mem_cgroup_write, | ||
2878 | .read_u64 = mem_cgroup_read, | ||
2879 | }, | ||
2880 | { | ||
2348 | .name = "failcnt", | 2881 | .name = "failcnt", |
2349 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 2882 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
2350 | .trigger = mem_cgroup_reset, | 2883 | .trigger = mem_cgroup_reset, |
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2438 | mz = &pn->zoneinfo[zone]; | 2971 | mz = &pn->zoneinfo[zone]; |
2439 | for_each_lru(l) | 2972 | for_each_lru(l) |
2440 | INIT_LIST_HEAD(&mz->lists[l]); | 2973 | INIT_LIST_HEAD(&mz->lists[l]); |
2974 | mz->usage_in_excess = 0; | ||
2975 | mz->on_tree = false; | ||
2976 | mz->mem = mem; | ||
2441 | } | 2977 | } |
2442 | return 0; | 2978 | return 0; |
2443 | } | 2979 | } |
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
2483 | { | 3019 | { |
2484 | int node; | 3020 | int node; |
2485 | 3021 | ||
3022 | mem_cgroup_remove_from_trees(mem); | ||
2486 | free_css_id(&mem_cgroup_subsys, &mem->css); | 3023 | free_css_id(&mem_cgroup_subsys, &mem->css); |
2487 | 3024 | ||
2488 | for_each_node_state(node, N_POSSIBLE) | 3025 | for_each_node_state(node, N_POSSIBLE) |
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void) | |||
2531 | } | 3068 | } |
2532 | #endif | 3069 | #endif |
2533 | 3070 | ||
3071 | static int mem_cgroup_soft_limit_tree_init(void) | ||
3072 | { | ||
3073 | struct mem_cgroup_tree_per_node *rtpn; | ||
3074 | struct mem_cgroup_tree_per_zone *rtpz; | ||
3075 | int tmp, node, zone; | ||
3076 | |||
3077 | for_each_node_state(node, N_POSSIBLE) { | ||
3078 | tmp = node; | ||
3079 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
3080 | tmp = -1; | ||
3081 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
3082 | if (!rtpn) | ||
3083 | return 1; | ||
3084 | |||
3085 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
3086 | |||
3087 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
3088 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
3089 | rtpz->rb_root = RB_ROOT; | ||
3090 | spin_lock_init(&rtpz->lock); | ||
3091 | } | ||
3092 | } | ||
3093 | return 0; | ||
3094 | } | ||
3095 | |||
2534 | static struct cgroup_subsys_state * __ref | 3096 | static struct cgroup_subsys_state * __ref |
2535 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 3097 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
2536 | { | 3098 | { |
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2545 | for_each_node_state(node, N_POSSIBLE) | 3107 | for_each_node_state(node, N_POSSIBLE) |
2546 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 3108 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
2547 | goto free_out; | 3109 | goto free_out; |
3110 | |||
2548 | /* root ? */ | 3111 | /* root ? */ |
2549 | if (cont->parent == NULL) { | 3112 | if (cont->parent == NULL) { |
2550 | enable_swap_cgroup(); | 3113 | enable_swap_cgroup(); |
2551 | parent = NULL; | 3114 | parent = NULL; |
3115 | root_mem_cgroup = mem; | ||
3116 | if (mem_cgroup_soft_limit_tree_init()) | ||
3117 | goto free_out; | ||
3118 | |||
2552 | } else { | 3119 | } else { |
2553 | parent = mem_cgroup_from_cont(cont->parent); | 3120 | parent = mem_cgroup_from_cont(cont->parent); |
2554 | mem->use_hierarchy = parent->use_hierarchy; | 3121 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2577 | return &mem->css; | 3144 | return &mem->css; |
2578 | free_out: | 3145 | free_out: |
2579 | __mem_cgroup_free(mem); | 3146 | __mem_cgroup_free(mem); |
3147 | root_mem_cgroup = NULL; | ||
2580 | return ERR_PTR(error); | 3148 | return ERR_PTR(error); |
2581 | } | 3149 | } |
2582 | 3150 | ||
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
2612 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3180 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
2613 | struct cgroup *cont, | 3181 | struct cgroup *cont, |
2614 | struct cgroup *old_cont, | 3182 | struct cgroup *old_cont, |
2615 | struct task_struct *p) | 3183 | struct task_struct *p, |
3184 | bool threadgroup) | ||
2616 | { | 3185 | { |
2617 | mutex_lock(&memcg_tasklist); | 3186 | mutex_lock(&memcg_tasklist); |
2618 | /* | 3187 | /* |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 000000000000..729d4b15b645 --- /dev/null +++ b/mm/memory-failure.c | |||
@@ -0,0 +1,832 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008, 2009 Intel Corporation | ||
3 | * Authors: Andi Kleen, Fengguang Wu | ||
4 | * | ||
5 | * This software may be redistributed and/or modified under the terms of | ||
6 | * the GNU General Public License ("GPL") version 2 only as published by the | ||
7 | * Free Software Foundation. | ||
8 | * | ||
9 | * High level machine check handler. Handles pages reported by the | ||
10 | * hardware as being corrupted, usually due to a 2-bit ECC memory or cache | ||
11 | * failure. | ||
12 | * | ||
13 | * Handles page cache pages in various states. The tricky part | ||
14 | * here is that we can access any page asynchronously to other VM | ||
15 | * users, because memory failures could happen anytime and anywhere, | ||
16 | * possibly violating some of their assumptions. This is why this code | ||
17 | * has to be extremely careful. Generally it tries to use normal locking | ||
18 | * rules, as in get the standard locks, even if that means the | ||
19 | * error handling takes potentially a long time. | ||
20 | * | ||
21 | * The operation to map back from RMAP chains to processes has to walk | ||
22 | * the complete process list and has non-linear complexity in the number of | ||
23 | * mappings. In short it can be quite slow. But since memory corruptions | ||
24 | * are rare we hope to get away with this. | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | * Notebook: | ||
29 | * - hugetlb needs more code | ||
30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | ||
31 | * - pass bad pages to kdump next kernel | ||
32 | */ | ||
33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/page-flags.h> | ||
37 | #include <linux/sched.h> | ||
38 | #include <linux/rmap.h> | ||
39 | #include <linux/pagemap.h> | ||
40 | #include <linux/swap.h> | ||
41 | #include <linux/backing-dev.h> | ||
42 | #include "internal.h" | ||
43 | |||
44 | int sysctl_memory_failure_early_kill __read_mostly = 0; | ||
45 | |||
46 | int sysctl_memory_failure_recovery __read_mostly = 1; | ||
47 | |||
48 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | ||
49 | |||
50 | /* | ||
51 | * Send all the processes that have the page mapped an ``action optional'' | ||
52 | * signal. | ||
53 | */ | ||
54 | static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | ||
55 | unsigned long pfn) | ||
56 | { | ||
57 | struct siginfo si; | ||
58 | int ret; | ||
59 | |||
60 | printk(KERN_ERR | ||
61 | "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", | ||
62 | pfn, t->comm, t->pid); | ||
63 | si.si_signo = SIGBUS; | ||
64 | si.si_errno = 0; | ||
65 | si.si_code = BUS_MCEERR_AO; | ||
66 | si.si_addr = (void *)addr; | ||
67 | #ifdef __ARCH_SI_TRAPNO | ||
68 | si.si_trapno = trapno; | ||
69 | #endif | ||
70 | si.si_addr_lsb = PAGE_SHIFT; | ||
71 | /* | ||
72 | * Don't use force here; it's convenient if the signal | ||
73 | * can be temporarily blocked. | ||
74 | * This could cause a loop when the user sets SIGBUS | ||
75 | * to SIG_IGN, but hopefully no one will do that? | ||
76 | */ | ||
77 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | ||
78 | if (ret < 0) | ||
79 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", | ||
80 | t->comm, t->pid, ret); | ||
81 | return ret; | ||
82 | } | ||
83 | |||
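kill_proc_ao() delivers SIGBUS with si_code BUS_MCEERR_AO and records the corruption granularity in si_addr_lsb, so an application that opted in to early kills can try to recover instead of dying. The handler below is a hypothetical user-space consumer of that signal, not part of the patch; si_addr_lsb is left out because older library headers may not expose it, and the fprintf call is for demonstration only (it is not async-signal-safe).

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef BUS_MCEERR_AO
    #define BUS_MCEERR_AO 5   /* may be missing from older userspace headers */
    #endif

    /* "Action optional" handler: the data at si_addr is gone, but the process
     * may keep running if it can discard or rebuild that data. */
    static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
        if (si->si_code == BUS_MCEERR_AO) {
            /* fprintf is not async-signal-safe; fine for a demonstration */
            fprintf(stderr, "memory error near %p, dropping cached data\n",
                    si->si_addr);
            return;
        }
        _exit(1);   /* any other SIGBUS is fatal */
    }

    int main(void)
    {
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = sigbus_handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGBUS, &sa, NULL);

        pause();   /* a poisoned page would show up here as SIGBUS */
        return 0;
    }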
84 | /* | ||
85 | * Kill all processes that have a poisoned page mapped and then isolate | ||
86 | * the page. | ||
87 | * | ||
88 | * General strategy: | ||
89 | * Find all processes having the page mapped and kill them. | ||
90 | * But we keep a page reference around so that the page is not | ||
91 | * actually freed yet. | ||
92 | * Then stash the page away | ||
93 | * | ||
94 | * There's no convenient way to get back to mapped processes | ||
95 | * from the VMAs. So do a brute-force search over all | ||
96 | * running processes. | ||
97 | * | ||
98 | * Remember that machine checks are not common (or rather | ||
99 | * if they are common you have other problems), so this shouldn't | ||
100 | * be a performance issue. | ||
101 | * | ||
102 | * Also there are some races possible while we get from the | ||
103 | * error detection to actually handle it. | ||
104 | */ | ||
105 | |||
106 | struct to_kill { | ||
107 | struct list_head nd; | ||
108 | struct task_struct *tsk; | ||
109 | unsigned long addr; | ||
110 | unsigned addr_valid:1; | ||
111 | }; | ||
112 | |||
113 | /* | ||
114 | * Failure handling: if we can't find or can't kill a process there's | ||
115 | * not much we can do. We just print a message and otherwise ignore it. | ||
116 | */ | ||
117 | |||
118 | /* | ||
119 | * Schedule a process for later kill. | ||
120 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. | ||
121 | * TBD would GFP_NOIO be enough? | ||
122 | */ | ||
123 | static void add_to_kill(struct task_struct *tsk, struct page *p, | ||
124 | struct vm_area_struct *vma, | ||
125 | struct list_head *to_kill, | ||
126 | struct to_kill **tkc) | ||
127 | { | ||
128 | struct to_kill *tk; | ||
129 | |||
130 | if (*tkc) { | ||
131 | tk = *tkc; | ||
132 | *tkc = NULL; | ||
133 | } else { | ||
134 | tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); | ||
135 | if (!tk) { | ||
136 | printk(KERN_ERR | ||
137 | "MCE: Out of memory while machine check handling\n"); | ||
138 | return; | ||
139 | } | ||
140 | } | ||
141 | tk->addr = page_address_in_vma(p, vma); | ||
142 | tk->addr_valid = 1; | ||
143 | |||
144 | /* | ||
145 | * In theory we don't have to kill when the page was | ||
146 | * munmapped. But it could also be a mremap. Since that's | ||
147 | * likely very rare, kill anyway just out of paranoia, but use | ||
148 | * a SIGKILL because the error is not contained anymore. | ||
149 | */ | ||
150 | if (tk->addr == -EFAULT) { | ||
151 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | ||
152 | page_to_pfn(p), tsk->comm); | ||
153 | tk->addr_valid = 0; | ||
154 | } | ||
155 | get_task_struct(tsk); | ||
156 | tk->tsk = tsk; | ||
157 | list_add_tail(&tk->nd, to_kill); | ||
158 | } | ||
159 | |||
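add_to_kill() first consumes the single to_kill slot that collect_procs() preallocates before any locks are taken, and only then falls back to GFP_ATOMIC, so at least one victim can always be recorded even if atomic allocation fails under tasklist_lock. Below is a small user-space sketch of that "preallocate one, best effort afterwards" pattern, with invented names; it only mirrors the allocation strategy, not the kernel data structures.

    #include <stdio.h>
    #include <stdlib.h>

    struct victim { int pid; struct victim *next; };

    /* Record a victim, preferring the caller's preallocated slot so that the
     * first entry never depends on an allocation succeeding "under a lock". */
    static void record_victim(int pid, struct victim **list, struct victim **prealloc)
    {
        struct victim *v;

        if (*prealloc) {
            v = *prealloc;
            *prealloc = NULL;
        } else {
            v = malloc(sizeof(*v));   /* the kernel side uses GFP_ATOMIC here */
            if (!v)
                return;               /* best effort beyond the first slot */
        }
        v->pid = pid;
        v->next = *list;
        *list = v;
    }

    int main(void)
    {
        struct victim *list = NULL, *v;
        struct victim *spare = malloc(sizeof(*spare));   /* done before "locking" */

        record_victim(100, &list, &spare);
        record_victim(200, &list, &spare);
        for (v = list; v; v = v->next)
            printf("kill pid %d\n", v->pid);
        return 0;
    }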
160 | /* | ||
161 | * Kill the processes that have been collected earlier. | ||
162 | * | ||
163 | * Only do anything when DOIT is set; otherwise just free the list | ||
164 | * (this is used for clean pages, which do not need killing). | ||
165 | * Also, when FAIL is set, do a force kill because something went | ||
166 | * wrong earlier. | ||
167 | */ | ||
168 | static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | ||
169 | int fail, unsigned long pfn) | ||
170 | { | ||
171 | struct to_kill *tk, *next; | ||
172 | |||
173 | list_for_each_entry_safe (tk, next, to_kill, nd) { | ||
174 | if (doit) { | ||
175 | /* | ||
176 | * In case something went wrong with munmapping, | ||
177 | * make sure the process doesn't catch the | ||
178 | * signal and then access the memory. Don't rely | ||
179 | * on its signal handlers; just kill it. | ||
180 | */ | ||
181 | if (fail || tk->addr_valid == 0) { | ||
182 | printk(KERN_ERR | ||
183 | "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", | ||
184 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
185 | force_sig(SIGKILL, tk->tsk); | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * In theory the process could have mapped | ||
190 | * something else on the address in-between. We could | ||
191 | * check for that, but we need to tell the | ||
192 | * process anyway. | ||
193 | */ | ||
194 | else if (kill_proc_ao(tk->tsk, tk->addr, trapno, | ||
195 | pfn) < 0) | ||
196 | printk(KERN_ERR | ||
197 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", | ||
198 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
199 | } | ||
200 | put_task_struct(tk->tsk); | ||
201 | kfree(tk); | ||
202 | } | ||
203 | } | ||
204 | |||
205 | static int task_early_kill(struct task_struct *tsk) | ||
206 | { | ||
207 | if (!tsk->mm) | ||
208 | return 0; | ||
209 | if (tsk->flags & PF_MCE_PROCESS) | ||
210 | return !!(tsk->flags & PF_MCE_EARLY); | ||
211 | return sysctl_memory_failure_early_kill; | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Collect processes when the error hits an anonymous page. | ||
216 | */ | ||
217 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | ||
218 | struct to_kill **tkc) | ||
219 | { | ||
220 | struct vm_area_struct *vma; | ||
221 | struct task_struct *tsk; | ||
222 | struct anon_vma *av; | ||
223 | |||
224 | read_lock(&tasklist_lock); | ||
225 | av = page_lock_anon_vma(page); | ||
226 | if (av == NULL) /* Not actually mapped anymore */ | ||
227 | goto out; | ||
228 | for_each_process (tsk) { | ||
229 | if (!task_early_kill(tsk)) | ||
230 | continue; | ||
231 | list_for_each_entry (vma, &av->head, anon_vma_node) { | ||
232 | if (!page_mapped_in_vma(page, vma)) | ||
233 | continue; | ||
234 | if (vma->vm_mm == tsk->mm) | ||
235 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
236 | } | ||
237 | } | ||
238 | page_unlock_anon_vma(av); | ||
239 | out: | ||
240 | read_unlock(&tasklist_lock); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Collect processes when the error hits a file-mapped page. | ||
245 | */ | ||
246 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | ||
247 | struct to_kill **tkc) | ||
248 | { | ||
249 | struct vm_area_struct *vma; | ||
250 | struct task_struct *tsk; | ||
251 | struct prio_tree_iter iter; | ||
252 | struct address_space *mapping = page->mapping; | ||
253 | |||
254 | /* | ||
255 | * A note on the locking order between the two locks. | ||
256 | * We don't rely on this particular order. | ||
257 | * If you have some other code that needs a different order | ||
258 | * feel free to switch them around. Or add a reverse link | ||
259 | * from mm_struct to task_struct, then this could be all | ||
260 | * done without taking tasklist_lock and looping over all tasks. | ||
261 | */ | ||
262 | |||
263 | read_lock(&tasklist_lock); | ||
264 | spin_lock(&mapping->i_mmap_lock); | ||
265 | for_each_process(tsk) { | ||
266 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
267 | |||
268 | if (!task_early_kill(tsk)) | ||
269 | continue; | ||
270 | |||
271 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | ||
272 | pgoff) { | ||
273 | /* | ||
274 | * Send early kill signal to tasks where a vma covers | ||
275 | * the page but the corrupted page is not necessarily | ||
276 | * mapped in its pte. | ||
277 | * Assume applications that requested early kill want | ||
278 | * to be informed of all such data corruptions. | ||
279 | */ | ||
280 | if (vma->vm_mm == tsk->mm) | ||
281 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
282 | } | ||
283 | } | ||
284 | spin_unlock(&mapping->i_mmap_lock); | ||
285 | read_unlock(&tasklist_lock); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Collect the processes that have the corrupted page mapped so they can be killed. | ||
290 | * This is done in two steps for locking reasons. | ||
291 | * First preallocate one tokill structure outside the spin locks, | ||
292 | * so that we can kill at least one process reasonably reliably. | ||
293 | */ | ||
294 | static void collect_procs(struct page *page, struct list_head *tokill) | ||
295 | { | ||
296 | struct to_kill *tk; | ||
297 | |||
298 | if (!page->mapping) | ||
299 | return; | ||
300 | |||
301 | tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); | ||
302 | if (!tk) | ||
303 | return; | ||
304 | if (PageAnon(page)) | ||
305 | collect_procs_anon(page, tokill, &tk); | ||
306 | else | ||
307 | collect_procs_file(page, tokill, &tk); | ||
308 | kfree(tk); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Error handlers for various types of pages. | ||
313 | */ | ||
314 | |||
315 | enum outcome { | ||
316 | FAILED, /* Error handling failed */ | ||
317 | DELAYED, /* Will be handled later */ | ||
318 | IGNORED, /* Error safely ignored */ | ||
319 | RECOVERED, /* Successfully recovered */ | ||
320 | }; | ||
321 | |||
322 | static const char *action_name[] = { | ||
323 | [FAILED] = "Failed", | ||
324 | [DELAYED] = "Delayed", | ||
325 | [IGNORED] = "Ignored", | ||
326 | [RECOVERED] = "Recovered", | ||
327 | }; | ||
328 | |||
329 | /* | ||
330 | * Error hit kernel page. | ||
331 | * Do nothing; try to be lucky and not touch it. For a few cases we | ||
332 | * could be more sophisticated. | ||
333 | */ | ||
334 | static int me_kernel(struct page *p, unsigned long pfn) | ||
335 | { | ||
336 | return DELAYED; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Already poisoned page. | ||
341 | */ | ||
342 | static int me_ignore(struct page *p, unsigned long pfn) | ||
343 | { | ||
344 | return IGNORED; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * Page in unknown state. Do nothing. | ||
349 | */ | ||
350 | static int me_unknown(struct page *p, unsigned long pfn) | ||
351 | { | ||
352 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); | ||
353 | return FAILED; | ||
354 | } | ||
355 | |||
356 | /* | ||
357 | * Free memory | ||
358 | */ | ||
359 | static int me_free(struct page *p, unsigned long pfn) | ||
360 | { | ||
361 | return DELAYED; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Clean (or cleaned) page cache page. | ||
366 | */ | ||
367 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | ||
368 | { | ||
369 | int err; | ||
370 | int ret = FAILED; | ||
371 | struct address_space *mapping; | ||
372 | |||
373 | if (!isolate_lru_page(p)) | ||
374 | page_cache_release(p); | ||
375 | |||
376 | /* | ||
377 | * For anonymous pages we're done; the only reference left | ||
378 | * should be the one m_f() holds. | ||
379 | */ | ||
380 | if (PageAnon(p)) | ||
381 | return RECOVERED; | ||
382 | |||
383 | /* | ||
384 | * Now truncate the page in the page cache. This is really | ||
385 | * more like a "temporary hole punch". | ||
386 | * Don't do this for block devices when someone else | ||
387 | * has a reference, because it could be file system metadata | ||
388 | * and that's not safe to truncate. | ||
389 | */ | ||
390 | mapping = page_mapping(p); | ||
391 | if (!mapping) { | ||
392 | /* | ||
393 | * Page has been torn down in the meantime. | ||
394 | */ | ||
395 | return FAILED; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Truncation is a bit tricky. Enable it per file system for now. | ||
400 | * | ||
401 | * Open: to take i_mutex or not for this? Right now we don't. | ||
402 | */ | ||
403 | if (mapping->a_ops->error_remove_page) { | ||
404 | err = mapping->a_ops->error_remove_page(mapping, p); | ||
405 | if (err != 0) { | ||
406 | printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", | ||
407 | pfn, err); | ||
408 | } else if (page_has_private(p) && | ||
409 | !try_to_release_page(p, GFP_NOIO)) { | ||
410 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | ||
411 | } else { | ||
412 | ret = RECOVERED; | ||
413 | } | ||
414 | } else { | ||
415 | /* | ||
416 | * If the file system doesn't support it, just invalidate. | ||
417 | * This fails on dirty pages or anything with private data. | ||
418 | */ | ||
419 | if (invalidate_inode_page(p)) | ||
420 | ret = RECOVERED; | ||
421 | else | ||
422 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", | ||
423 | pfn); | ||
424 | } | ||
425 | return ret; | ||
426 | } | ||
427 | |||
428 | /* | ||
429 | * Dirty pagecache page. | ||
430 | * Issues: when the error hits a hole page the error is not properly | ||
431 | * propagated. | ||
432 | */ | ||
433 | static int me_pagecache_dirty(struct page *p, unsigned long pfn) | ||
434 | { | ||
435 | struct address_space *mapping = page_mapping(p); | ||
436 | |||
437 | SetPageError(p); | ||
438 | /* TBD: print more information about the file. */ | ||
439 | if (mapping) { | ||
440 | /* | ||
441 | * IO error will be reported by write(), fsync(), etc. | ||
442 | * who check the mapping. | ||
443 | * This way the application knows that something went | ||
444 | * wrong with its dirty file data. | ||
445 | * | ||
446 | * There's one open issue: | ||
447 | * | ||
448 | * The EIO will be only reported on the next IO | ||
449 | * operation and then cleared through the IO map. | ||
450 | * Normally Linux has two mechanisms to pass IO error | ||
451 | * first through the AS_EIO flag in the address space | ||
452 | * and then through the PageError flag in the page. | ||
453 | * Since we drop pages on memory failure handling the | ||
454 | * only mechanism open to use is through AS_EIO. | ||
455 | * | ||
456 | * This has the disadvantage that it gets cleared on | ||
457 | * the first operation that returns an error, while | ||
458 | * the PageError bit is more sticky and only cleared | ||
459 | * when the page is reread or dropped. If an | ||
460 | * application assumes it will always get error on | ||
461 | * fsync, but does other operations on the fd before | ||
462 | * and the page is dropped in between, then the error | ||
463 | * will not be properly reported. | ||
464 | * | ||
465 | * This can already happen even without hwpoisoned | ||
466 | * pages: first on metadata IO errors (which only | ||
467 | * report through AS_EIO) or when the page is dropped | ||
468 | * at the wrong time. | ||
469 | * | ||
470 | * So right now we assume that the application DTRT on | ||
471 | * the first EIO, but we're not worse than other parts | ||
472 | * of the kernel. | ||
473 | */ | ||
474 | mapping_set_error(mapping, EIO); | ||
475 | } | ||
476 | |||
477 | return me_pagecache_clean(p, pfn); | ||
478 | } | ||
479 | |||
480 | /* | ||
481 | * Clean and dirty swap cache. | ||
482 | * | ||
483 | * Dirty swap cache page is tricky to handle. The page could live both in page | ||
484 | * cache and swap cache (i.e. the page is freshly swapped in). So it could be | ||
485 | * referenced concurrently by 2 types of PTEs: | ||
486 | * normal PTEs and swap PTEs. We try to handle them consistently by calling | ||
487 | * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, | ||
488 | * and then | ||
489 | * - clear dirty bit to prevent IO | ||
490 | * - remove from LRU | ||
491 | * - but keep in the swap cache, so that when we return to it on | ||
492 | * a later page fault, we know the application is accessing | ||
493 | * corrupted data and shall be killed (we installed simple | ||
494 | * interception code in do_swap_page to catch it). | ||
495 | * | ||
496 | * Clean swap cache pages can be directly isolated. A later page fault will | ||
497 | * bring in the known good data from disk. | ||
498 | */ | ||
499 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) | ||
500 | { | ||
501 | int ret = FAILED; | ||
502 | |||
503 | ClearPageDirty(p); | ||
504 | /* Trigger EIO in shmem: */ | ||
505 | ClearPageUptodate(p); | ||
506 | |||
507 | if (!isolate_lru_page(p)) { | ||
508 | page_cache_release(p); | ||
509 | ret = DELAYED; | ||
510 | } | ||
511 | |||
512 | return ret; | ||
513 | } | ||
514 | |||
515 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | ||
516 | { | ||
517 | int ret = FAILED; | ||
518 | |||
519 | if (!isolate_lru_page(p)) { | ||
520 | page_cache_release(p); | ||
521 | ret = RECOVERED; | ||
522 | } | ||
523 | delete_from_swap_cache(p); | ||
524 | return ret; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Huge pages. Needs work. | ||
529 | * Issues: | ||
530 | * No rmap support so we cannot find the original mapper. In theory could walk | ||
531 | * all MMs and look for the mappings, but that would be non atomic and racy. | ||
532 | * Need rmap for hugepages for this. Alternatively we could employ a heuristic, | ||
533 | * like just walking the current process and hoping it has it mapped (that | ||
534 | * should usually be true for the common "shared database cache" case). | ||
535 | * Should handle free huge pages and dequeue them too, but this needs to | ||
536 | * handle huge page accounting correctly. | ||
537 | */ | ||
538 | static int me_huge_page(struct page *p, unsigned long pfn) | ||
539 | { | ||
540 | return FAILED; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * Various page states we can handle. | ||
545 | * | ||
546 | * A page state is defined by its current page->flags bits. | ||
547 | * The table matches them in order and calls the right handler. | ||
548 | * | ||
549 | * This is quite tricky because we can access the page at any time | ||
550 | * in its life cycle, so all accesses have to be extremely careful. | ||
551 | * | ||
552 | * This is not complete. More states could be added. | ||
553 | * For any missing state don't attempt recovery. | ||
554 | */ | ||
555 | |||
556 | #define dirty (1UL << PG_dirty) | ||
557 | #define sc (1UL << PG_swapcache) | ||
558 | #define unevict (1UL << PG_unevictable) | ||
559 | #define mlock (1UL << PG_mlocked) | ||
560 | #define writeback (1UL << PG_writeback) | ||
561 | #define lru (1UL << PG_lru) | ||
562 | #define swapbacked (1UL << PG_swapbacked) | ||
563 | #define head (1UL << PG_head) | ||
564 | #define tail (1UL << PG_tail) | ||
565 | #define compound (1UL << PG_compound) | ||
566 | #define slab (1UL << PG_slab) | ||
567 | #define buddy (1UL << PG_buddy) | ||
568 | #define reserved (1UL << PG_reserved) | ||
569 | |||
570 | static struct page_state { | ||
571 | unsigned long mask; | ||
572 | unsigned long res; | ||
573 | char *msg; | ||
574 | int (*action)(struct page *p, unsigned long pfn); | ||
575 | } error_states[] = { | ||
576 | { reserved, reserved, "reserved kernel", me_ignore }, | ||
577 | { buddy, buddy, "free kernel", me_free }, | ||
578 | |||
579 | /* | ||
580 | * Could in theory check if the slab page is free or if we can drop | ||
581 | * currently unused objects without touching them. But just | ||
582 | * treat it as a standard kernel page for now. | ||
583 | */ | ||
584 | { slab, slab, "kernel slab", me_kernel }, | ||
585 | |||
586 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
587 | { head, head, "huge", me_huge_page }, | ||
588 | { tail, tail, "huge", me_huge_page }, | ||
589 | #else | ||
590 | { compound, compound, "huge", me_huge_page }, | ||
591 | #endif | ||
592 | |||
593 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | ||
594 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | ||
595 | |||
596 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | ||
597 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | ||
598 | |||
599 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
600 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | ||
601 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | ||
602 | #endif | ||
603 | |||
604 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | ||
605 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | ||
606 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
607 | |||
608 | /* | ||
609 | * Catchall entry: must be at end. | ||
610 | */ | ||
611 | { 0, 0, "unknown page state", me_unknown }, | ||
612 | }; | ||
613 | |||
614 | #undef lru | ||
615 | |||
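The error_states[] table is a first-match list: the poisoned page's flags are masked with ps->mask and compared against ps->res, so the more specific rows (dirty swapcache, mlocked, dirty LRU) have to sit above the generic clean-LRU rows, and the { 0, 0, ... } catchall guarantees the scan terminates. The stand-alone sketch below only demonstrates that matching scheme, with made-up flag bits instead of real page->flags values.

    #include <stdio.h>

    /* Toy flag bits standing in for page->flags policy bits. */
    #define F_DIRTY     (1UL << 0)
    #define F_SWAPCACHE (1UL << 1)
    #define F_LRU       (1UL << 2)

    struct state { unsigned long mask, res; const char *msg; };

    /* First-match table: more specific rows must come before generic ones,
     * and the catchall { 0, 0, ... } row terminates the scan. */
    static const struct state states[] = {
        { F_SWAPCACHE | F_DIRTY, F_SWAPCACHE | F_DIRTY, "dirty swapcache" },
        { F_SWAPCACHE | F_DIRTY, F_SWAPCACHE,           "clean swapcache" },
        { F_LRU | F_DIRTY,       F_LRU | F_DIRTY,       "dirty LRU"       },
        { F_LRU | F_DIRTY,       F_LRU,                 "clean LRU"       },
        { 0,                     0,                     "unknown"         },
    };

    static const char *classify(unsigned long flags)
    {
        const struct state *s;

        for (s = states; ; s++)
            if ((flags & s->mask) == s->res)
                return s->msg;
    }

    int main(void)
    {
        printf("%s\n", classify(F_LRU | F_DIRTY));   /* dirty LRU */
        printf("%s\n", classify(F_SWAPCACHE));       /* clean swapcache */
        printf("%s\n", classify(0));                 /* unknown */
        return 0;
    }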
616 | static void action_result(unsigned long pfn, char *msg, int result) | ||
617 | { | ||
618 | struct page *page = NULL; | ||
619 | if (pfn_valid(pfn)) | ||
620 | page = pfn_to_page(pfn); | ||
621 | |||
622 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
623 | pfn, | ||
624 | page && PageDirty(page) ? "dirty " : "", | ||
625 | msg, action_name[result]); | ||
626 | } | ||
627 | |||
628 | static int page_action(struct page_state *ps, struct page *p, | ||
629 | unsigned long pfn, int ref) | ||
630 | { | ||
631 | int result; | ||
632 | |||
633 | result = ps->action(p, pfn); | ||
634 | action_result(pfn, ps->msg, result); | ||
635 | if (page_count(p) != 1 + ref) | ||
636 | printk(KERN_ERR | ||
637 | "MCE %#lx: %s page still referenced by %d users\n", | ||
638 | pfn, ps->msg, page_count(p) - 1); | ||
639 | |||
640 | /* Could do more checks here if page looks ok */ | ||
641 | /* | ||
642 | * Could adjust zone counters here to correct for the missing page. | ||
643 | */ | ||
644 | |||
645 | return result == RECOVERED ? 0 : -EBUSY; | ||
646 | } | ||
647 | |||
648 | #define N_UNMAP_TRIES 5 | ||
649 | |||
650 | /* | ||
651 | * Do all that is necessary to remove user space mappings. Unmap | ||
652 | * the pages and send SIGBUS to the processes if the data was dirty. | ||
653 | */ | ||
654 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | ||
655 | int trapno) | ||
656 | { | ||
657 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | ||
658 | struct address_space *mapping; | ||
659 | LIST_HEAD(tokill); | ||
660 | int ret; | ||
661 | int i; | ||
662 | int kill = 1; | ||
663 | |||
664 | if (PageReserved(p) || PageCompound(p) || PageSlab(p)) | ||
665 | return; | ||
666 | |||
667 | if (!PageLRU(p)) | ||
668 | lru_add_drain_all(); | ||
669 | |||
670 | /* | ||
671 | * This check implies we don't kill processes if their pages | ||
672 | * are in the swap cache early. Those are always late kills. | ||
673 | */ | ||
674 | if (!page_mapped(p)) | ||
675 | return; | ||
676 | |||
677 | if (PageSwapCache(p)) { | ||
678 | printk(KERN_ERR | ||
679 | "MCE %#lx: keeping poisoned page in swap cache\n", pfn); | ||
680 | ttu |= TTU_IGNORE_HWPOISON; | ||
681 | } | ||
682 | |||
683 | /* | ||
684 | * Propagate the dirty bit from PTEs to struct page first, because we | ||
685 | * need this to decide if we should kill or just drop the page. | ||
686 | */ | ||
687 | mapping = page_mapping(p); | ||
688 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | ||
689 | if (page_mkclean(p)) { | ||
690 | SetPageDirty(p); | ||
691 | } else { | ||
692 | kill = 0; | ||
693 | ttu |= TTU_IGNORE_HWPOISON; | ||
694 | printk(KERN_INFO | ||
695 | "MCE %#lx: corrupted page was clean: dropped without side effects\n", | ||
696 | pfn); | ||
697 | } | ||
698 | } | ||
699 | |||
700 | /* | ||
701 | * First collect all the processes that have the page | ||
702 | * mapped in dirty form. This has to be done before try_to_unmap, | ||
703 | * because ttu takes the rmap data structures down. | ||
704 | * | ||
705 | * Error handling: We ignore errors here because | ||
706 | * there's nothing that can be done. | ||
707 | */ | ||
708 | if (kill) | ||
709 | collect_procs(p, &tokill); | ||
710 | |||
711 | /* | ||
712 | * try_to_unmap can fail temporarily due to races. | ||
713 | * Try a few times (RED-PEN better strategy?) | ||
714 | */ | ||
715 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
716 | ret = try_to_unmap(p, ttu); | ||
717 | if (ret == SWAP_SUCCESS) | ||
718 | break; | ||
719 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
720 | } | ||
721 | |||
722 | if (ret != SWAP_SUCCESS) | ||
723 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | ||
724 | pfn, page_mapcount(p)); | ||
725 | |||
726 | /* | ||
727 | * Now that the dirty bit has been propagated to the | ||
728 | * struct page and all unmaps done we can decide if | ||
729 | * killing is needed or not. Only kill when the page | ||
730 | * was dirty, otherwise the tokill list is merely | ||
731 | * freed. When there was a problem unmapping earlier | ||
732 | * use a more forceful, uncatchable kill to prevent | ||
733 | * any accesses to the poisoned memory. | ||
734 | */ | ||
735 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | ||
736 | ret != SWAP_SUCCESS, pfn); | ||
737 | } | ||
738 | |||
739 | int __memory_failure(unsigned long pfn, int trapno, int ref) | ||
740 | { | ||
741 | struct page_state *ps; | ||
742 | struct page *p; | ||
743 | int res; | ||
744 | |||
745 | if (!sysctl_memory_failure_recovery) | ||
746 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | ||
747 | |||
748 | if (!pfn_valid(pfn)) { | ||
749 | action_result(pfn, "memory outside kernel control", IGNORED); | ||
750 | return -EIO; | ||
751 | } | ||
752 | |||
753 | p = pfn_to_page(pfn); | ||
754 | if (TestSetPageHWPoison(p)) { | ||
755 | action_result(pfn, "already hardware poisoned", IGNORED); | ||
756 | return 0; | ||
757 | } | ||
758 | |||
759 | atomic_long_add(1, &mce_bad_pages); | ||
760 | |||
761 | /* | ||
762 | * We need/can do nothing about count=0 pages. | ||
763 | * 1) it's a free page, and therefore in safe hand: | ||
764 | * prep_new_page() will be the gate keeper. | ||
765 | * 2) it's part of a non-compound high order page. | ||
766 | * Implies some kernel user: cannot stop them from | ||
767 | * R/W the page; let's pray that the page has been | ||
768 | * used and will be freed some time later. | ||
769 | * In fact it's dangerous to directly bump up page count from 0, | ||
770 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | ||
771 | */ | ||
772 | if (!get_page_unless_zero(compound_head(p))) { | ||
773 | action_result(pfn, "free or high order kernel", IGNORED); | ||
774 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | ||
775 | } | ||
776 | |||
777 | /* | ||
778 | * Lock the page and wait for writeback to finish. | ||
779 | * It's very difficult to mess with pages currently under IO | ||
780 | * and in many cases impossible, so we just avoid it here. | ||
781 | */ | ||
782 | lock_page_nosync(p); | ||
783 | wait_on_page_writeback(p); | ||
784 | |||
785 | /* | ||
786 | * Now take care of user space mappings. | ||
787 | */ | ||
788 | hwpoison_user_mappings(p, pfn, trapno); | ||
789 | |||
790 | /* | ||
791 | * Torn down by someone else? | ||
792 | */ | ||
793 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | ||
794 | action_result(pfn, "already truncated LRU", IGNORED); | ||
795 | res = 0; | ||
796 | goto out; | ||
797 | } | ||
798 | |||
799 | res = -EBUSY; | ||
800 | for (ps = error_states;; ps++) { | ||
801 | if ((p->flags & ps->mask) == ps->res) { | ||
802 | res = page_action(ps, p, pfn, ref); | ||
803 | break; | ||
804 | } | ||
805 | } | ||
806 | out: | ||
807 | unlock_page(p); | ||
808 | return res; | ||
809 | } | ||
810 | EXPORT_SYMBOL_GPL(__memory_failure); | ||
811 | |||
812 | /** | ||
813 | * memory_failure - Handle memory failure of a page. | ||
814 | * @pfn: Page Number of the corrupted page | ||
815 | * @trapno: Trap number reported in the signal to user space. | ||
816 | * | ||
817 | * This function is called by the low level machine check code | ||
818 | * of an architecture when it detects hardware memory corruption | ||
819 | * of a page. It tries its best to recover, which includes | ||
820 | * dropping pages, killing processes etc. | ||
821 | * | ||
822 | * The function is primarily of use for corruptions that | ||
823 | * happen outside the current execution context (e.g. when | ||
824 | * detected by a background scrubber). | ||
825 | * | ||
826 | * Must run in process context (e.g. a work queue) with interrupts | ||
827 | * enabled and no spinlocks held. | ||
828 | */ | ||
829 | void memory_failure(unsigned long pfn, int trapno) | ||
830 | { | ||
831 | __memory_failure(pfn, trapno, 0); | ||
832 | } | ||
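Because memory_failure() must run in process context with interrupts enabled, a machine-check handler that detects a corrupted page from interrupt or NMI-like context is expected to defer the call, for example through a work queue. The fragment below is only a sketch of that hand-off pattern with invented names; it is not how the x86 MCE code in this series is wired up, and it assumes __memory_failure() is declared in a shared header.

    #include <linux/workqueue.h>
    #include <linux/mm.h>

    /* Hypothetical deferral: stash the bad pfn and let a work item call the
     * recovery code later, from process context. Invented names throughout. */
    static unsigned long bad_pfn;

    static void hwpoison_work_fn(struct work_struct *work)
    {
        __memory_failure(bad_pfn, 0, 0);   /* exported above via EXPORT_SYMBOL_GPL */
    }
    static DECLARE_WORK(hwpoison_work, hwpoison_work_fn);

    /* Called from the low-level machine check path with the corrupted pfn. */
    void report_poisoned_pfn(unsigned long pfn)
    {
        bad_pfn = pfn;                 /* single-slot example; real code would queue */
        schedule_work(&hwpoison_work);
    }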
diff --git a/mm/memory.c b/mm/memory.c index b1443ac07c00..7e91b5f9f690 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
297 | unsigned long addr = vma->vm_start; | 297 | unsigned long addr = vma->vm_start; |
298 | 298 | ||
299 | /* | 299 | /* |
300 | * Hide vma from rmap and vmtruncate before freeing pgtables | 300 | * Hide vma from rmap and truncate_pagecache before freeing |
301 | * pgtables | ||
301 | */ | 302 | */ |
302 | anon_vma_unlink(vma); | 303 | anon_vma_unlink(vma); |
303 | unlink_file_vma(vma); | 304 | unlink_file_vma(vma); |
@@ -1325,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1325 | if (ret & VM_FAULT_ERROR) { | 1326 | if (ret & VM_FAULT_ERROR) { |
1326 | if (ret & VM_FAULT_OOM) | 1327 | if (ret & VM_FAULT_OOM) |
1327 | return i ? i : -ENOMEM; | 1328 | return i ? i : -ENOMEM; |
1328 | else if (ret & VM_FAULT_SIGBUS) | 1329 | if (ret & |
1330 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | ||
1329 | return i ? i : -EFAULT; | 1331 | return i ? i : -EFAULT; |
1330 | BUG(); | 1332 | BUG(); |
1331 | } | 1333 | } |
@@ -2407,7 +2409,7 @@ restart: | |||
2407 | * @mapping: the address space containing mmaps to be unmapped. | 2409 | * @mapping: the address space containing mmaps to be unmapped. |
2408 | * @holebegin: byte in first page to unmap, relative to the start of | 2410 | * @holebegin: byte in first page to unmap, relative to the start of |
2409 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2411 | * the underlying file. This will be rounded down to a PAGE_SIZE |
2410 | * boundary. Note that this is different from vmtruncate(), which | 2412 | * boundary. Note that this is different from truncate_pagecache(), which |
2411 | * must keep the partial page. In contrast, we must get rid of | 2413 | * must keep the partial page. In contrast, we must get rid of |
2412 | * partial pages. | 2414 | * partial pages. |
2413 | * @holelen: size of prospective hole in bytes. This will be rounded | 2415 | * @holelen: size of prospective hole in bytes. This will be rounded |
@@ -2458,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2458 | } | 2460 | } |
2459 | EXPORT_SYMBOL(unmap_mapping_range); | 2461 | EXPORT_SYMBOL(unmap_mapping_range); |
2460 | 2462 | ||
2461 | /** | ||
2462 | * vmtruncate - unmap mappings "freed" by truncate() syscall | ||
2463 | * @inode: inode of the file used | ||
2464 | * @offset: file offset to start truncating | ||
2465 | * | ||
2466 | * NOTE! We have to be ready to update the memory sharing | ||
2467 | * between the file and the memory map for a potential last | ||
2468 | * incomplete page. Ugly, but necessary. | ||
2469 | */ | ||
2470 | int vmtruncate(struct inode * inode, loff_t offset) | ||
2471 | { | ||
2472 | if (inode->i_size < offset) { | ||
2473 | unsigned long limit; | ||
2474 | |||
2475 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
2476 | if (limit != RLIM_INFINITY && offset > limit) | ||
2477 | goto out_sig; | ||
2478 | if (offset > inode->i_sb->s_maxbytes) | ||
2479 | goto out_big; | ||
2480 | i_size_write(inode, offset); | ||
2481 | } else { | ||
2482 | struct address_space *mapping = inode->i_mapping; | ||
2483 | |||
2484 | /* | ||
2485 | * truncation of in-use swapfiles is disallowed - it would | ||
2486 | * cause subsequent swapout to scribble on the now-freed | ||
2487 | * blocks. | ||
2488 | */ | ||
2489 | if (IS_SWAPFILE(inode)) | ||
2490 | return -ETXTBSY; | ||
2491 | i_size_write(inode, offset); | ||
2492 | |||
2493 | /* | ||
2494 | * unmap_mapping_range is called twice, first simply for | ||
2495 | * efficiency so that truncate_inode_pages does fewer | ||
2496 | * single-page unmaps. However after this first call, and | ||
2497 | * before truncate_inode_pages finishes, it is possible for | ||
2498 | * private pages to be COWed, which remain after | ||
2499 | * truncate_inode_pages finishes, hence the second | ||
2500 | * unmap_mapping_range call must be made for correctness. | ||
2501 | */ | ||
2502 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
2503 | truncate_inode_pages(mapping, offset); | ||
2504 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
2505 | } | ||
2506 | |||
2507 | if (inode->i_op->truncate) | ||
2508 | inode->i_op->truncate(inode); | ||
2509 | return 0; | ||
2510 | |||
2511 | out_sig: | ||
2512 | send_sig(SIGXFSZ, current, 0); | ||
2513 | out_big: | ||
2514 | return -EFBIG; | ||
2515 | } | ||
2516 | EXPORT_SYMBOL(vmtruncate); | ||
2517 | |||
2518 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | 2463 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) |
2519 | { | 2464 | { |
2520 | struct address_space *mapping = inode->i_mapping; | 2465 | struct address_space *mapping = inode->i_mapping; |
@@ -2559,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2559 | goto out; | 2504 | goto out; |
2560 | 2505 | ||
2561 | entry = pte_to_swp_entry(orig_pte); | 2506 | entry = pte_to_swp_entry(orig_pte); |
2562 | if (is_migration_entry(entry)) { | 2507 | if (unlikely(non_swap_entry(entry))) { |
2563 | migration_entry_wait(mm, pmd, address); | 2508 | if (is_migration_entry(entry)) { |
2509 | migration_entry_wait(mm, pmd, address); | ||
2510 | } else if (is_hwpoison_entry(entry)) { | ||
2511 | ret = VM_FAULT_HWPOISON; | ||
2512 | } else { | ||
2513 | print_bad_pte(vma, address, orig_pte, NULL); | ||
2514 | ret = VM_FAULT_OOM; | ||
2515 | } | ||
2564 | goto out; | 2516 | goto out; |
2565 | } | 2517 | } |
2566 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2518 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
@@ -2584,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2584 | /* Had to read the page from swap area: Major fault */ | 2536 | /* Had to read the page from swap area: Major fault */ |
2585 | ret = VM_FAULT_MAJOR; | 2537 | ret = VM_FAULT_MAJOR; |
2586 | count_vm_event(PGMAJFAULT); | 2538 | count_vm_event(PGMAJFAULT); |
2539 | } else if (PageHWPoison(page)) { | ||
2540 | ret = VM_FAULT_HWPOISON; | ||
2541 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
2542 | goto out; | ||
2587 | } | 2543 | } |
2588 | 2544 | ||
2589 | lock_page(page); | 2545 | lock_page(page); |
@@ -2760,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2760 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2716 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2761 | return ret; | 2717 | return ret; |
2762 | 2718 | ||
2719 | if (unlikely(PageHWPoison(vmf.page))) { | ||
2720 | if (ret & VM_FAULT_LOCKED) | ||
2721 | unlock_page(vmf.page); | ||
2722 | return VM_FAULT_HWPOISON; | ||
2723 | } | ||
2724 | |||
2763 | /* | 2725 | /* |
2764 | * For consistency in subsequent calls, make the faulted page always | 2726 | * For consistency in subsequent calls, make the faulted page always |
2765 | * locked. | 2727 | * locked. |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index efe3e0ec2e61..821dee596377 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -413,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
413 | if (!populated_zone(zone)) | 413 | if (!populated_zone(zone)) |
414 | need_zonelists_rebuild = 1; | 414 | need_zonelists_rebuild = 1; |
415 | 415 | ||
416 | ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, | 416 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
417 | online_pages_range); | 417 | online_pages_range); |
418 | if (ret) { | 418 | if (ret) { |
419 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 419 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
@@ -705,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | |||
705 | static void | 705 | static void |
706 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | 706 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
707 | { | 707 | { |
708 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | 708 | walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, |
709 | offline_isolated_pages_cb); | 709 | offline_isolated_pages_cb); |
710 | } | 710 | } |
711 | 711 | ||
@@ -731,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
731 | long offlined = 0; | 731 | long offlined = 0; |
732 | int ret; | 732 | int ret; |
733 | 733 | ||
734 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | 734 | ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, |
735 | check_pages_isolated_cb); | 735 | check_pages_isolated_cb); |
736 | if (ret < 0) | 736 | if (ret < 0) |
737 | offlined = (long)ret; | 737 | offlined = (long)ret; |
diff --git a/mm/migrate.c b/mm/migrate.c index 16052e80aaac..1a4bf4813780 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
675 | } | 675 | } |
676 | 676 | ||
677 | /* Establish migration ptes or remove ptes */ | 677 | /* Establish migration ptes or remove ptes */ |
678 | try_to_unmap(page, 1); | 678 | try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
679 | 679 | ||
680 | skip_unmap: | 680 | skip_unmap: |
681 | if (!page_mapped(page)) | 681 | if (!page_mapped(page)) |
diff --git a/mm/mremap.c b/mm/mremap.c index 20a07dba6be0..97bff2547719 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
86 | if (vma->vm_file) { | 86 | if (vma->vm_file) { |
87 | /* | 87 | /* |
88 | * Subtle point from Rajesh Venkatasubramanian: before | 88 | * Subtle point from Rajesh Venkatasubramanian: before |
89 | * moving file-based ptes, we must lock vmtruncate out, | 89 | * moving file-based ptes, we must lock truncate_pagecache |
90 | * since it might clean the dst vma before the src vma, | 90 | * out, since it might clean the dst vma before the src vma, |
91 | * and we propagate stale pages into the dst afterward. | 91 | * and we propagate stale pages into the dst afterward. |
92 | */ | 92 | */ |
93 | mapping = vma->vm_file->f_mapping; | 93 | mapping = vma->vm_file->f_mapping; |
diff --git a/mm/nommu.c b/mm/nommu.c index 1a4473faac48..c73aa4753d79 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -61,6 +61,7 @@ void *high_memory; | |||
61 | struct page *mem_map; | 61 | struct page *mem_map; |
62 | unsigned long max_mapnr; | 62 | unsigned long max_mapnr; |
63 | unsigned long num_physpages; | 63 | unsigned long num_physpages; |
64 | unsigned long highest_memmap_pfn; | ||
64 | struct percpu_counter vm_committed_as; | 65 | struct percpu_counter vm_committed_as; |
65 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
66 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
@@ -82,46 +83,6 @@ struct vm_operations_struct generic_file_vm_ops = { | |||
82 | }; | 83 | }; |
83 | 84 | ||
84 | /* | 85 | /* |
85 | * Handle all mappings that got truncated by a "truncate()" | ||
86 | * system call. | ||
87 | * | ||
88 | * NOTE! We have to be ready to update the memory sharing | ||
89 | * between the file and the memory map for a potential last | ||
90 | * incomplete page. Ugly, but necessary. | ||
91 | */ | ||
92 | int vmtruncate(struct inode *inode, loff_t offset) | ||
93 | { | ||
94 | struct address_space *mapping = inode->i_mapping; | ||
95 | unsigned long limit; | ||
96 | |||
97 | if (inode->i_size < offset) | ||
98 | goto do_expand; | ||
99 | i_size_write(inode, offset); | ||
100 | |||
101 | truncate_inode_pages(mapping, offset); | ||
102 | goto out_truncate; | ||
103 | |||
104 | do_expand: | ||
105 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
106 | if (limit != RLIM_INFINITY && offset > limit) | ||
107 | goto out_sig; | ||
108 | if (offset > inode->i_sb->s_maxbytes) | ||
109 | goto out; | ||
110 | i_size_write(inode, offset); | ||
111 | |||
112 | out_truncate: | ||
113 | if (inode->i_op->truncate) | ||
114 | inode->i_op->truncate(inode); | ||
115 | return 0; | ||
116 | out_sig: | ||
117 | send_sig(SIGXFSZ, current, 0); | ||
118 | out: | ||
119 | return -EFBIG; | ||
120 | } | ||
121 | |||
122 | EXPORT_SYMBOL(vmtruncate); | ||
123 | |||
124 | /* | ||
125 | * Return the total memory allocated for this pointer, not | 86 | * Return the total memory allocated for this pointer, not |
126 | * just what the caller asked for. | 87 | * just what the caller asked for. |
127 | * | 88 | * |
@@ -169,7 +130,7 @@ unsigned int kobjsize(const void *objp) | |||
169 | } | 130 | } |
170 | 131 | ||
171 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 132 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
172 | unsigned long start, int nr_pages, int foll_flags, | 133 | unsigned long start, int nr_pages, unsigned int foll_flags, |
173 | struct page **pages, struct vm_area_struct **vmas) | 134 | struct page **pages, struct vm_area_struct **vmas) |
174 | { | 135 | { |
175 | struct vm_area_struct *vma; | 136 | struct vm_area_struct *vma; |
@@ -865,7 +826,7 @@ static int validate_mmap_request(struct file *file, | |||
865 | int ret; | 826 | int ret; |
866 | 827 | ||
867 | /* do the simple checks first */ | 828 | /* do the simple checks first */ |
868 | if (flags & MAP_FIXED || addr) { | 829 | if (flags & MAP_FIXED) { |
869 | printk(KERN_DEBUG | 830 | printk(KERN_DEBUG |
870 | "%d: Can't do fixed-address/overlay mmap of RAM\n", | 831 | "%d: Can't do fixed-address/overlay mmap of RAM\n", |
871 | current->pid); | 832 | current->pid); |
@@ -1073,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
1073 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1034 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1074 | if (ret == 0) { | 1035 | if (ret == 0) { |
1075 | vma->vm_region->vm_top = vma->vm_region->vm_end; | 1036 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1076 | return ret; | 1037 | return 0; |
1077 | } | 1038 | } |
1078 | if (ret != -ENOSYS) | 1039 | if (ret != -ENOSYS) |
1079 | return ret; | 1040 | return ret; |
@@ -1090,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
1090 | */ | 1051 | */ |
1091 | static int do_mmap_private(struct vm_area_struct *vma, | 1052 | static int do_mmap_private(struct vm_area_struct *vma, |
1092 | struct vm_region *region, | 1053 | struct vm_region *region, |
1093 | unsigned long len) | 1054 | unsigned long len, |
1055 | unsigned long capabilities) | ||
1094 | { | 1056 | { |
1095 | struct page *pages; | 1057 | struct page *pages; |
1096 | unsigned long total, point, n, rlen; | 1058 | unsigned long total, point, n, rlen; |
@@ -1101,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1101 | * shared mappings on devices or memory | 1063 | * shared mappings on devices or memory |
1102 | * - VM_MAYSHARE will be set if it may attempt to share | 1064 | * - VM_MAYSHARE will be set if it may attempt to share |
1103 | */ | 1065 | */ |
1104 | if (vma->vm_file) { | 1066 | if (capabilities & BDI_CAP_MAP_DIRECT) { |
1105 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1067 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1106 | if (ret == 0) { | 1068 | if (ret == 0) { |
1107 | /* shouldn't return success if we're not sharing */ | 1069 | /* shouldn't return success if we're not sharing */ |
1108 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); | 1070 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
1109 | vma->vm_region->vm_top = vma->vm_region->vm_end; | 1071 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1110 | return ret; | 1072 | return 0; |
1111 | } | 1073 | } |
1112 | if (ret != -ENOSYS) | 1074 | if (ret != -ENOSYS) |
1113 | return ret; | 1075 | return ret; |
@@ -1220,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1220 | 1182 | ||
1221 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1183 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
1222 | 1184 | ||
1223 | if (!(flags & MAP_FIXED)) | ||
1224 | addr = round_hint_to_min(addr); | ||
1225 | |||
1226 | /* decide whether we should attempt the mapping, and if so what sort of | 1185 | /* decide whether we should attempt the mapping, and if so what sort of |
1227 | * mapping */ | 1186 | * mapping */ |
1228 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1187 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
@@ -1232,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1232 | return ret; | 1191 | return ret; |
1233 | } | 1192 | } |
1234 | 1193 | ||
1194 | /* we ignore the address hint */ | ||
1195 | addr = 0; | ||
1196 | |||
1235 | /* we've determined that we can make the mapping, now translate what we | 1197 | /* we've determined that we can make the mapping, now translate what we |
1236 | * now know into VMA flags */ | 1198 | * now know into VMA flags */ |
1237 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1199 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
@@ -1345,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1345 | * - this is the hook for quasi-memory character devices to | 1307 | * - this is the hook for quasi-memory character devices to |
1346 | * tell us the location of a shared mapping | 1308 | * tell us the location of a shared mapping |
1347 | */ | 1309 | */ |
1348 | if (file && file->f_op->get_unmapped_area) { | 1310 | if (capabilities & BDI_CAP_MAP_DIRECT) { |
1349 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1311 | addr = file->f_op->get_unmapped_area(file, addr, len, |
1350 | pgoff, flags); | 1312 | pgoff, flags); |
1351 | if (IS_ERR((void *) addr)) { | 1313 | if (IS_ERR((void *) addr)) { |
@@ -1369,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1369 | } | 1331 | } |
1370 | 1332 | ||
1371 | vma->vm_region = region; | 1333 | vma->vm_region = region; |
1372 | add_nommu_region(region); | ||
1373 | 1334 | ||
1374 | /* set up the mapping */ | 1335 | /* set up the mapping |
1336 | * - the region is filled in if BDI_CAP_MAP_DIRECT is still set | ||
1337 | */ | ||
1375 | if (file && vma->vm_flags & VM_SHARED) | 1338 | if (file && vma->vm_flags & VM_SHARED) |
1376 | ret = do_mmap_shared_file(vma); | 1339 | ret = do_mmap_shared_file(vma); |
1377 | else | 1340 | else |
1378 | ret = do_mmap_private(vma, region, len); | 1341 | ret = do_mmap_private(vma, region, len, capabilities); |
1379 | if (ret < 0) | 1342 | if (ret < 0) |
1380 | goto error_put_region; | 1343 | goto error_just_free; |
1344 | add_nommu_region(region); | ||
1381 | 1345 | ||
1382 | /* okay... we have a mapping; now we have to register it */ | 1346 | /* okay... we have a mapping; now we have to register it */ |
1383 | result = vma->vm_start; | 1347 | result = vma->vm_start; |
@@ -1395,19 +1359,6 @@ share: | |||
1395 | kleave(" = %lx", result); | 1359 | kleave(" = %lx", result); |
1396 | return result; | 1360 | return result; |
1397 | 1361 | ||
1398 | error_put_region: | ||
1399 | __put_nommu_region(region); | ||
1400 | if (vma) { | ||
1401 | if (vma->vm_file) { | ||
1402 | fput(vma->vm_file); | ||
1403 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1404 | removed_exe_file_vma(vma->vm_mm); | ||
1405 | } | ||
1406 | kmem_cache_free(vm_area_cachep, vma); | ||
1407 | } | ||
1408 | kleave(" = %d [pr]", ret); | ||
1409 | return ret; | ||
1410 | |||
1411 | error_just_free: | 1362 | error_just_free: |
1412 | up_write(&nommu_region_sem); | 1363 | up_write(&nommu_region_sem); |
1413 | error: | 1364 | error: |
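
Several of the nommu hunks above key on the backing device's capabilities rather than on the mere presence of a file: do_mmap_private() and the get_unmapped_area hook are now gated on BDI_CAP_MAP_DIRECT. A small sketch of that capability test, written as a hypothetical helper rather than anything in the patch:

#include <linux/backing-dev.h>
#include <linux/fs.h>

/* Hypothetical helper: can this file's backing device map pages directly? */
static bool example_can_map_directly(struct file *file)
{
        struct backing_dev_info *bdi;

        if (!file || !file->f_mapping)
                return false;
        bdi = file->f_mapping->backing_dev_info;
        return bdi && (bdi->capabilities & BDI_CAP_MAP_DIRECT);
}
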
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8bef063125b1..69b5fbabc8bd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -158,37 +158,37 @@ static void update_completion_period(void) | |||
158 | } | 158 | } |
159 | 159 | ||
160 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 160 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
161 | struct file *filp, void __user *buffer, size_t *lenp, | 161 | void __user *buffer, size_t *lenp, |
162 | loff_t *ppos) | 162 | loff_t *ppos) |
163 | { | 163 | { |
164 | int ret; | 164 | int ret; |
165 | 165 | ||
166 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 166 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
167 | if (ret == 0 && write) | 167 | if (ret == 0 && write) |
168 | dirty_background_bytes = 0; | 168 | dirty_background_bytes = 0; |
169 | return ret; | 169 | return ret; |
170 | } | 170 | } |
171 | 171 | ||
172 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | 172 | int dirty_background_bytes_handler(struct ctl_table *table, int write, |
173 | struct file *filp, void __user *buffer, size_t *lenp, | 173 | void __user *buffer, size_t *lenp, |
174 | loff_t *ppos) | 174 | loff_t *ppos) |
175 | { | 175 | { |
176 | int ret; | 176 | int ret; |
177 | 177 | ||
178 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 178 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
179 | if (ret == 0 && write) | 179 | if (ret == 0 && write) |
180 | dirty_background_ratio = 0; | 180 | dirty_background_ratio = 0; |
181 | return ret; | 181 | return ret; |
182 | } | 182 | } |
183 | 183 | ||
184 | int dirty_ratio_handler(struct ctl_table *table, int write, | 184 | int dirty_ratio_handler(struct ctl_table *table, int write, |
185 | struct file *filp, void __user *buffer, size_t *lenp, | 185 | void __user *buffer, size_t *lenp, |
186 | loff_t *ppos) | 186 | loff_t *ppos) |
187 | { | 187 | { |
188 | int old_ratio = vm_dirty_ratio; | 188 | int old_ratio = vm_dirty_ratio; |
189 | int ret; | 189 | int ret; |
190 | 190 | ||
191 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 191 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
192 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 192 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
193 | update_completion_period(); | 193 | update_completion_period(); |
194 | vm_dirty_bytes = 0; | 194 | vm_dirty_bytes = 0; |
@@ -198,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
198 | 198 | ||
199 | 199 | ||
200 | int dirty_bytes_handler(struct ctl_table *table, int write, | 200 | int dirty_bytes_handler(struct ctl_table *table, int write, |
201 | struct file *filp, void __user *buffer, size_t *lenp, | 201 | void __user *buffer, size_t *lenp, |
202 | loff_t *ppos) | 202 | loff_t *ppos) |
203 | { | 203 | { |
204 | unsigned long old_bytes = vm_dirty_bytes; | 204 | unsigned long old_bytes = vm_dirty_bytes; |
205 | int ret; | 205 | int ret; |
206 | 206 | ||
207 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 207 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
208 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 208 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
209 | update_completion_period(); | 209 | update_completion_period(); |
210 | vm_dirty_ratio = 0; | 210 | vm_dirty_ratio = 0; |
@@ -690,9 +690,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | |||
690 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 690 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
691 | */ | 691 | */ |
692 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 692 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
693 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 693 | void __user *buffer, size_t *length, loff_t *ppos) |
694 | { | 694 | { |
695 | proc_dointvec(table, write, file, buffer, length, ppos); | 695 | proc_dointvec(table, write, buffer, length, ppos); |
696 | return 0; | 696 | return 0; |
697 | } | 697 | } |
698 | 698 | ||
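
All of the handlers above lose their struct file * argument, and the proc_do*() helpers are now called without it. A sketch of the post-change handler shape, using a made-up tunable named example_knob:

#include <linux/sysctl.h>
#include <linux/kernel.h>

static int example_knob;

/* Assumes the registering ctl_table's .data points at example_knob. */
static int example_knob_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                pr_debug("example_knob set to %d\n", example_knob);
        return ret;
}
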
@@ -1153,6 +1153,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | |||
1153 | EXPORT_SYMBOL(redirty_page_for_writepage); | 1153 | EXPORT_SYMBOL(redirty_page_for_writepage); |
1154 | 1154 | ||
1155 | /* | 1155 | /* |
1156 | * Dirty a page. | ||
1157 | * | ||
1158 | * For pages with a mapping this should be done under the page lock | ||
1159 | * for the benefit of asynchronous memory errors, which prefer a consistent | ||
1160 | * dirty state. This rule can be broken in some special cases, | ||
1161 | * but it is better not to. | ||
1162 | * | ||
1156 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1163 | * If the mapping doesn't provide a set_page_dirty a_op, then |
1157 | * just fall through and assume that it wants buffer_heads. | 1164 | * just fall through and assume that it wants buffer_heads. |
1158 | */ | 1165 | */ |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5717f27a0704..bf720550b44d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -234,6 +234,12 @@ static void bad_page(struct page *page) | |||
234 | static unsigned long nr_shown; | 234 | static unsigned long nr_shown; |
235 | static unsigned long nr_unshown; | 235 | static unsigned long nr_unshown; |
236 | 236 | ||
237 | /* Don't complain about poisoned pages */ | ||
238 | if (PageHWPoison(page)) { | ||
239 | __ClearPageBuddy(page); | ||
240 | return; | ||
241 | } | ||
242 | |||
237 | /* | 243 | /* |
238 | * Allow a burst of 60 reports, then keep quiet for that minute; | 244 | * Allow a burst of 60 reports, then keep quiet for that minute; |
239 | * or allow a steady drip of one report per second. | 245 | * or allow a steady drip of one report per second. |
@@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
666 | /* | 672 | /* |
667 | * This page is about to be returned from the page allocator | 673 | * This page is about to be returned from the page allocator |
668 | */ | 674 | */ |
669 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 675 | static inline int check_new_page(struct page *page) |
670 | { | 676 | { |
671 | if (unlikely(page_mapcount(page) | | 677 | if (unlikely(page_mapcount(page) | |
672 | (page->mapping != NULL) | | 678 | (page->mapping != NULL) | |
@@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
675 | bad_page(page); | 681 | bad_page(page); |
676 | return 1; | 682 | return 1; |
677 | } | 683 | } |
684 | return 0; | ||
685 | } | ||
686 | |||
687 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | ||
688 | { | ||
689 | int i; | ||
690 | |||
691 | for (i = 0; i < (1 << order); i++) { | ||
692 | struct page *p = page + i; | ||
693 | if (unlikely(check_new_page(p))) | ||
694 | return 1; | ||
695 | } | ||
678 | 696 | ||
679 | set_page_private(page, 0); | 697 | set_page_private(page, 0); |
680 | set_page_refcounted(page); | 698 | set_page_refcounted(page); |
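
prep_new_page() now validates every constituent page of a higher-order block, so a poisoned tail page can no longer slip back into circulation, and bad_page() above stays quiet about pages that are bad only because they are hardware-poisoned. A sketch of the per-constituent-page loop, illustrative rather than the allocator's actual code:

#include <linux/mm.h>

/* Sketch: walk all 1 << order struct pages of a block and refuse it if any
 * constituent page is hardware-poisoned. */
static bool example_order_block_is_clean(struct page *page, unsigned int order)
{
        unsigned int i;

        for (i = 0; i < (1U << order); i++) {
                if (PageHWPoison(page + i))
                        return false;
        }
        return true;
}
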
@@ -2373,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); | |||
2373 | * sysctl handler for numa_zonelist_order | 2391 | * sysctl handler for numa_zonelist_order |
2374 | */ | 2392 | */ |
2375 | int numa_zonelist_order_handler(ctl_table *table, int write, | 2393 | int numa_zonelist_order_handler(ctl_table *table, int write, |
2376 | struct file *file, void __user *buffer, size_t *length, | 2394 | void __user *buffer, size_t *length, |
2377 | loff_t *ppos) | 2395 | loff_t *ppos) |
2378 | { | 2396 | { |
2379 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2397 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
@@ -2382,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2382 | if (write) | 2400 | if (write) |
2383 | strncpy(saved_string, (char*)table->data, | 2401 | strncpy(saved_string, (char*)table->data, |
2384 | NUMA_ZONELIST_ORDER_LEN); | 2402 | NUMA_ZONELIST_ORDER_LEN); |
2385 | ret = proc_dostring(table, write, file, buffer, length, ppos); | 2403 | ret = proc_dostring(table, write, buffer, length, ppos); |
2386 | if (ret) | 2404 | if (ret) |
2387 | return ret; | 2405 | return ret; |
2388 | if (write) { | 2406 | if (write) { |
@@ -4706,9 +4724,9 @@ module_init(init_per_zone_wmark_min) | |||
4706 | * changes. | 4724 | * changes. |
4707 | */ | 4725 | */ |
4708 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 4726 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
4709 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4727 | void __user *buffer, size_t *length, loff_t *ppos) |
4710 | { | 4728 | { |
4711 | proc_dointvec(table, write, file, buffer, length, ppos); | 4729 | proc_dointvec(table, write, buffer, length, ppos); |
4712 | if (write) | 4730 | if (write) |
4713 | setup_per_zone_wmarks(); | 4731 | setup_per_zone_wmarks(); |
4714 | return 0; | 4732 | return 0; |
@@ -4716,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
4716 | 4734 | ||
4717 | #ifdef CONFIG_NUMA | 4735 | #ifdef CONFIG_NUMA |
4718 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 4736 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
4719 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4737 | void __user *buffer, size_t *length, loff_t *ppos) |
4720 | { | 4738 | { |
4721 | struct zone *zone; | 4739 | struct zone *zone; |
4722 | int rc; | 4740 | int rc; |
4723 | 4741 | ||
4724 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4742 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4725 | if (rc) | 4743 | if (rc) |
4726 | return rc; | 4744 | return rc; |
4727 | 4745 | ||
@@ -4732,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
4732 | } | 4750 | } |
4733 | 4751 | ||
4734 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 4752 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
4735 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4753 | void __user *buffer, size_t *length, loff_t *ppos) |
4736 | { | 4754 | { |
4737 | struct zone *zone; | 4755 | struct zone *zone; |
4738 | int rc; | 4756 | int rc; |
4739 | 4757 | ||
4740 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4758 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4741 | if (rc) | 4759 | if (rc) |
4742 | return rc; | 4760 | return rc; |
4743 | 4761 | ||
@@ -4758,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
4758 | * if in function of the boot time zone sizes. | 4776 | * if in function of the boot time zone sizes. |
4759 | */ | 4777 | */ |
4760 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4778 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
4761 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4779 | void __user *buffer, size_t *length, loff_t *ppos) |
4762 | { | 4780 | { |
4763 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4781 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
4764 | setup_per_zone_lowmem_reserve(); | 4782 | setup_per_zone_lowmem_reserve(); |
4765 | return 0; | 4783 | return 0; |
4766 | } | 4784 | } |
@@ -4772,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
4772 | */ | 4790 | */ |
4773 | 4791 | ||
4774 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 4792 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
4775 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4793 | void __user *buffer, size_t *length, loff_t *ppos) |
4776 | { | 4794 | { |
4777 | struct zone *zone; | 4795 | struct zone *zone; |
4778 | unsigned int cpu; | 4796 | unsigned int cpu; |
4779 | int ret; | 4797 | int ret; |
4780 | 4798 | ||
4781 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4799 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4782 | if (!write || (ret == -EINVAL)) | 4800 | if (!write || (ret == -EINVAL)) |
4783 | return ret; | 4801 | return ret; |
4784 | for_each_populated_zone(zone) { | 4802 | for_each_populated_zone(zone) { |
diff --git a/mm/quicklist.c b/mm/quicklist.c index 6eedf7e473d1..6633965bb27b 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages) | |||
29 | int node = numa_node_id(); | 29 | int node = numa_node_id(); |
30 | struct zone *zones = NODE_DATA(node)->node_zones; | 30 | struct zone *zones = NODE_DATA(node)->node_zones; |
31 | int num_cpus_on_node; | 31 | int num_cpus_on_node; |
32 | const struct cpumask *cpumask_on_node = cpumask_of_node(node); | ||
33 | 32 | ||
34 | node_free_pages = | 33 | node_free_pages = |
35 | #ifdef CONFIG_ZONE_DMA | 34 | #ifdef CONFIG_ZONE_DMA |
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages) | |||
42 | 41 | ||
43 | max = node_free_pages / FRACTION_OF_NODE_MEM; | 42 | max = node_free_pages / FRACTION_OF_NODE_MEM; |
44 | 43 | ||
45 | num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); | 44 | num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); |
46 | max /= num_cpus_on_node; | 45 | max /= num_cpus_on_node; |
47 | 46 | ||
48 | return max(max, min_pages); | 47 | return max(max, min_pages); |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -36,6 +36,11 @@ | |||
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | * | ||
40 | * (code doesn't rely on that order so it could be switched around) | ||
41 | * ->tasklist_lock | ||
42 | * anon_vma->lock (memory_failure, collect_procs_anon) | ||
43 | * pte map lock | ||
39 | */ | 44 | */ |
40 | 45 | ||
41 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
@@ -191,7 +196,7 @@ void __init anon_vma_init(void) | |||
191 | * Getting a lock on a stable anon_vma from a page off the LRU is | 196 | * Getting a lock on a stable anon_vma from a page off the LRU is |
192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 197 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
193 | */ | 198 | */ |
194 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 199 | struct anon_vma *page_lock_anon_vma(struct page *page) |
195 | { | 200 | { |
196 | struct anon_vma *anon_vma; | 201 | struct anon_vma *anon_vma; |
197 | unsigned long anon_mapping; | 202 | unsigned long anon_mapping; |
@@ -211,7 +216,7 @@ out: | |||
211 | return NULL; | 216 | return NULL; |
212 | } | 217 | } |
213 | 218 | ||
214 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) | 219 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
215 | { | 220 | { |
216 | spin_unlock(&anon_vma->lock); | 221 | spin_unlock(&anon_vma->lock); |
217 | rcu_read_unlock(); | 222 | rcu_read_unlock(); |
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
311 | * if the page is not mapped into the page tables of this VMA. Only | 316 | * if the page is not mapped into the page tables of this VMA. Only |
312 | * valid for normal file or anonymous VMAs. | 317 | * valid for normal file or anonymous VMAs. |
313 | */ | 318 | */ |
314 | static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | 319 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) |
315 | { | 320 | { |
316 | unsigned long address; | 321 | unsigned long address; |
317 | pte_t *pte; | 322 | pte_t *pte; |
@@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page) | |||
756 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
757 | */ | 762 | */ |
758 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
759 | int migration) | 764 | enum ttu_flags flags) |
760 | { | 765 | { |
761 | struct mm_struct *mm = vma->vm_mm; | 766 | struct mm_struct *mm = vma->vm_mm; |
762 | unsigned long address; | 767 | unsigned long address; |
@@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
778 | * If it's recently referenced (perhaps page_referenced | 783 | * If it's recently referenced (perhaps page_referenced |
779 | * skipped over this mm) then we should reactivate it. | 784 | * skipped over this mm) then we should reactivate it. |
780 | */ | 785 | */ |
781 | if (!migration) { | 786 | if (!(flags & TTU_IGNORE_MLOCK)) { |
782 | if (vma->vm_flags & VM_LOCKED) { | 787 | if (vma->vm_flags & VM_LOCKED) { |
783 | ret = SWAP_MLOCK; | 788 | ret = SWAP_MLOCK; |
784 | goto out_unmap; | 789 | goto out_unmap; |
785 | } | 790 | } |
791 | } | ||
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | ||
786 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
787 | ret = SWAP_FAIL; | 794 | ret = SWAP_FAIL; |
788 | goto out_unmap; | 795 | goto out_unmap; |
@@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
800 | /* Update high watermark before we lower rss */ | 807 | /* Update high watermark before we lower rss */ |
801 | update_hiwater_rss(mm); | 808 | update_hiwater_rss(mm); |
802 | 809 | ||
803 | if (PageAnon(page)) { | 810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
811 | if (PageAnon(page)) | ||
812 | dec_mm_counter(mm, anon_rss); | ||
813 | else | ||
814 | dec_mm_counter(mm, file_rss); | ||
815 | set_pte_at(mm, address, pte, | ||
816 | swp_entry_to_pte(make_hwpoison_entry(page))); | ||
817 | } else if (PageAnon(page)) { | ||
804 | swp_entry_t entry = { .val = page_private(page) }; | 818 | swp_entry_t entry = { .val = page_private(page) }; |
805 | 819 | ||
806 | if (PageSwapCache(page)) { | 820 | if (PageSwapCache(page)) { |
@@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
822 | * pte. do_swap_page() will wait until the migration | 836 | * pte. do_swap_page() will wait until the migration |
823 | * pte is removed and then restart fault handling. | 837 | * pte is removed and then restart fault handling. |
824 | */ | 838 | */ |
825 | BUG_ON(!migration); | 839 | BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); |
826 | entry = make_migration_entry(page, pte_write(pteval)); | 840 | entry = make_migration_entry(page, pte_write(pteval)); |
827 | } | 841 | } |
828 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 842 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
829 | BUG_ON(pte_file(*pte)); | 843 | BUG_ON(pte_file(*pte)); |
830 | } else if (PAGE_MIGRATION && migration) { | 844 | } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { |
831 | /* Establish migration entry for a file page */ | 845 | /* Establish migration entry for a file page */ |
832 | swp_entry_t entry; | 846 | swp_entry_t entry; |
833 | entry = make_migration_entry(page, pte_write(pteval)); | 847 | entry = make_migration_entry(page, pte_write(pteval)); |
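
When try_to_unmap_one() meets a poisoned page (and TTU_IGNORE_HWPOISON is not set), it drops the rss counter and installs a special hwpoison swap entry in place of the pte, so any later access faults instead of touching the bad memory. A hedged sketch of how a fault path can classify such a non-present pte; is_hwpoison_entry() is assumed from <linux/swapops.h> as extended alongside make_hwpoison_entry() in this series:

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

enum example_entry_kind { EXAMPLE_SWAP, EXAMPLE_MIGRATION, EXAMPLE_HWPOISON };

/* Hypothetical helper: decide what a non-present pte's entry means. */
static enum example_entry_kind example_classify_entry(pte_t orig_pte)
{
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        if (is_migration_entry(entry))
                return EXAMPLE_MIGRATION;       /* caller waits for migration to finish */
        if (is_hwpoison_entry(entry))
                return EXAMPLE_HWPOISON;        /* caller fails the fault with VM_FAULT_HWPOISON */
        return EXAMPLE_SWAP;                    /* ordinary swap entry: swap the page back in */
}
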
@@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | |||
996 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1010 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
997 | * 'LOCKED. | 1011 | * 'LOCKED. |
998 | */ | 1012 | */ |
999 | static int try_to_unmap_anon(struct page *page, int unlock, int migration) | 1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1000 | { | 1014 | { |
1001 | struct anon_vma *anon_vma; | 1015 | struct anon_vma *anon_vma; |
1002 | struct vm_area_struct *vma; | 1016 | struct vm_area_struct *vma; |
1003 | unsigned int mlocked = 0; | 1017 | unsigned int mlocked = 0; |
1004 | int ret = SWAP_AGAIN; | 1018 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1005 | 1020 | ||
1006 | if (MLOCK_PAGES && unlikely(unlock)) | 1021 | if (MLOCK_PAGES && unlikely(unlock)) |
1007 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | 1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ |
@@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
1017 | continue; /* must visit all unlocked vmas */ | 1032 | continue; /* must visit all unlocked vmas */ |
1018 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ |
1019 | } else { | 1034 | } else { |
1020 | ret = try_to_unmap_one(page, vma, migration); | 1035 | ret = try_to_unmap_one(page, vma, flags); |
1021 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1036 | if (ret == SWAP_FAIL || !page_mapped(page)) |
1022 | break; | 1037 | break; |
1023 | } | 1038 | } |
@@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
1041 | /** | 1056 | /** |
1042 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | 1057 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method |
1043 | * @page: the page to unmap/unlock | 1058 | * @page: the page to unmap/unlock |
1044 | * @unlock: request for unlock rather than unmap [unlikely] | 1059 | * @flags: action and flags |
1045 | * @migration: unmapping for migration - ignored if @unlock | ||
1046 | * | 1060 | * |
1047 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1061 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1048 | * contained in the address_space struct it points to. | 1062 | * contained in the address_space struct it points to. |
@@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
1054 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1068 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
1055 | * 'LOCKED. | 1069 | * 'LOCKED. |
1056 | */ | 1070 | */ |
1057 | static int try_to_unmap_file(struct page *page, int unlock, int migration) | 1071 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) |
1058 | { | 1072 | { |
1059 | struct address_space *mapping = page->mapping; | 1073 | struct address_space *mapping = page->mapping; |
1060 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1074 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
1066 | unsigned long max_nl_size = 0; | 1080 | unsigned long max_nl_size = 0; |
1067 | unsigned int mapcount; | 1081 | unsigned int mapcount; |
1068 | unsigned int mlocked = 0; | 1082 | unsigned int mlocked = 0; |
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1069 | 1084 | ||
1070 | if (MLOCK_PAGES && unlikely(unlock)) | 1085 | if (MLOCK_PAGES && unlikely(unlock)) |
1071 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | 1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ |
@@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
1078 | continue; /* must visit all vmas */ | 1093 | continue; /* must visit all vmas */ |
1079 | ret = SWAP_MLOCK; | 1094 | ret = SWAP_MLOCK; |
1080 | } else { | 1095 | } else { |
1081 | ret = try_to_unmap_one(page, vma, migration); | 1096 | ret = try_to_unmap_one(page, vma, flags); |
1082 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1097 | if (ret == SWAP_FAIL || !page_mapped(page)) |
1083 | goto out; | 1098 | goto out; |
1084 | } | 1099 | } |
@@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
1103 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | 1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ |
1104 | goto out; /* no need to look further */ | 1119 | goto out; /* no need to look further */ |
1105 | } | 1120 | } |
1106 | if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) | 1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && |
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1107 | continue; | 1123 | continue; |
1108 | cursor = (unsigned long) vma->vm_private_data; | 1124 | cursor = (unsigned long) vma->vm_private_data; |
1109 | if (cursor > max_nl_cursor) | 1125 | if (cursor > max_nl_cursor) |
@@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
1137 | do { | 1153 | do { |
1138 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1139 | shared.vm_set.list) { | 1155 | shared.vm_set.list) { |
1140 | if (!MLOCK_PAGES && !migration && | 1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && |
1141 | (vma->vm_flags & VM_LOCKED)) | 1157 | (vma->vm_flags & VM_LOCKED)) |
1142 | continue; | 1158 | continue; |
1143 | cursor = (unsigned long) vma->vm_private_data; | 1159 | cursor = (unsigned long) vma->vm_private_data; |
@@ -1177,7 +1193,7 @@ out: | |||
1177 | /** | 1193 | /** |
1178 | * try_to_unmap - try to remove all page table mappings to a page | 1194 | * try_to_unmap - try to remove all page table mappings to a page |
1179 | * @page: the page to get unmapped | 1195 | * @page: the page to get unmapped |
1180 | * @migration: migration flag | 1196 | * @flags: action and flags |
1181 | * | 1197 | * |
1182 | * Tries to remove all the page table entries which are mapping this | 1198 | * Tries to remove all the page table entries which are mapping this |
1183 | * page, used in the pageout path. Caller must hold the page lock. | 1199 | * page, used in the pageout path. Caller must hold the page lock. |
@@ -1188,16 +1204,16 @@ out: | |||
1188 | * SWAP_FAIL - the page is unswappable | 1204 | * SWAP_FAIL - the page is unswappable |
1189 | * SWAP_MLOCK - page is mlocked. | 1205 | * SWAP_MLOCK - page is mlocked. |
1190 | */ | 1206 | */ |
1191 | int try_to_unmap(struct page *page, int migration) | 1207 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1192 | { | 1208 | { |
1193 | int ret; | 1209 | int ret; |
1194 | 1210 | ||
1195 | BUG_ON(!PageLocked(page)); | 1211 | BUG_ON(!PageLocked(page)); |
1196 | 1212 | ||
1197 | if (PageAnon(page)) | 1213 | if (PageAnon(page)) |
1198 | ret = try_to_unmap_anon(page, 0, migration); | 1214 | ret = try_to_unmap_anon(page, flags); |
1199 | else | 1215 | else |
1200 | ret = try_to_unmap_file(page, 0, migration); | 1216 | ret = try_to_unmap_file(page, flags); |
1201 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1217 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
1202 | ret = SWAP_SUCCESS; | 1218 | ret = SWAP_SUCCESS; |
1203 | return ret; | 1219 | return ret; |
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page) | |||
1222 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1223 | 1239 | ||
1224 | if (PageAnon(page)) | 1240 | if (PageAnon(page)) |
1225 | return try_to_unmap_anon(page, 1, 0); | 1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1226 | else | 1242 | else |
1227 | return try_to_unmap_file(page, 1, 0); | 1243 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1228 | } | 1244 | } |
1229 | 1245 | ||
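
With this change try_to_unmap() takes an enum ttu_flags instead of a bare migration flag: the low bits select the action (TTU_UNMAP, TTU_MIGRATION, TTU_MUNLOCK, recovered with TTU_ACTION()), and the remaining bits are modifiers such as TTU_IGNORE_MLOCK, TTU_IGNORE_ACCESS and TTU_IGNORE_HWPOISON. A sketch of how a caller composes them, matching the mm/migrate.c hunk earlier in this diff; the wrapper itself is made up:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Hypothetical wrapper: unmap a locked page in preparation for migration. */
static int example_unmap_for_migration(struct page *page)
{
        enum ttu_flags flags = TTU_MIGRATION |          /* action */
                               TTU_IGNORE_MLOCK |       /* don't bail on VM_LOCKED vmas */
                               TTU_IGNORE_ACCESS;       /* don't bail on recently referenced ptes */

        BUG_ON(!PageLocked(page));      /* try_to_unmap() requires the page lock */
        return try_to_unmap(page, flags);
}
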
diff --git a/mm/shmem.c b/mm/shmem.c index aa9481166aae..ccf446a9faa1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1634,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1634 | if (pos + copied > inode->i_size) | 1634 | if (pos + copied > inode->i_size) |
1635 | i_size_write(inode, pos + copied); | 1635 | i_size_write(inode, pos + copied); |
1636 | 1636 | ||
1637 | unlock_page(page); | ||
1638 | set_page_dirty(page); | 1637 | set_page_dirty(page); |
1638 | unlock_page(page); | ||
1639 | page_cache_release(page); | 1639 | page_cache_release(page); |
1640 | 1640 | ||
1641 | return copied; | 1641 | return copied; |
@@ -1972,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1972 | iput(inode); | 1972 | iput(inode); |
1973 | return error; | 1973 | return error; |
1974 | } | 1974 | } |
1975 | unlock_page(page); | ||
1976 | inode->i_mapping->a_ops = &shmem_aops; | 1975 | inode->i_mapping->a_ops = &shmem_aops; |
1977 | inode->i_op = &shmem_symlink_inode_operations; | 1976 | inode->i_op = &shmem_symlink_inode_operations; |
1978 | kaddr = kmap_atomic(page, KM_USER0); | 1977 | kaddr = kmap_atomic(page, KM_USER0); |
1979 | memcpy(kaddr, symname, len); | 1978 | memcpy(kaddr, symname, len); |
1980 | kunmap_atomic(kaddr, KM_USER0); | 1979 | kunmap_atomic(kaddr, KM_USER0); |
1981 | set_page_dirty(page); | 1980 | set_page_dirty(page); |
1981 | unlock_page(page); | ||
1982 | page_cache_release(page); | 1982 | page_cache_release(page); |
1983 | } | 1983 | } |
1984 | if (dir->i_mode & S_ISGID) | 1984 | if (dir->i_mode & S_ISGID) |
@@ -2421,6 +2421,7 @@ static const struct address_space_operations shmem_aops = { | |||
2421 | .write_end = shmem_write_end, | 2421 | .write_end = shmem_write_end, |
2422 | #endif | 2422 | #endif |
2423 | .migratepage = migrate_page, | 2423 | .migratepage = migrate_page, |
2424 | .error_remove_page = generic_error_remove_page, | ||
2424 | }; | 2425 | }; |
2425 | 2426 | ||
2426 | static const struct file_operations shmem_file_operations = { | 2427 | static const struct file_operations shmem_file_operations = { |
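
Besides the ordering fixes above (the page is now marked dirty before it is unlocked), shmem opts in to memory-failure recovery by wiring generic_error_remove_page() into its address_space_operations. A sketch of how another, hypothetical filesystem would do the same; generic_error_remove_page() is the helper added to mm/truncate.c later in this diff:

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical filesystem aops: ->error_remove_page lets the memory-failure
 * code drop a corrupted pagecache page of a regular file. */
static const struct address_space_operations examplefs_aops = {
        .readpage          = simple_readpage,
        .write_begin       = simple_write_begin,
        .write_end         = simple_write_end,
        .error_remove_page = generic_error_remove_page,
};
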
diff --git a/mm/swapfile.c b/mm/swapfile.c index f1bf19daadc6..4de7f02f820b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
699 | struct swap_info_struct *p; | 699 | struct swap_info_struct *p; |
700 | struct page *page = NULL; | 700 | struct page *page = NULL; |
701 | 701 | ||
702 | if (is_migration_entry(entry)) | 702 | if (non_swap_entry(entry)) |
703 | return 1; | 703 | return 1; |
704 | 704 | ||
705 | p = swap_info_get(entry); | 705 | p = swap_info_get(entry); |
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) | |||
2085 | int count; | 2085 | int count; |
2086 | bool has_cache; | 2086 | bool has_cache; |
2087 | 2087 | ||
2088 | if (is_migration_entry(entry)) | 2088 | if (non_swap_entry(entry)) |
2089 | return -EINVAL; | 2089 | return -EINVAL; |
2090 | 2090 | ||
2091 | type = swp_type(entry); | 2091 | type = swp_type(entry); |
diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf7cb98..450cebdabfc0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); | |||
93 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and | 93 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and |
94 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. | 94 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. |
95 | */ | 95 | */ |
96 | static void | 96 | static int |
97 | truncate_complete_page(struct address_space *mapping, struct page *page) | 97 | truncate_complete_page(struct address_space *mapping, struct page *page) |
98 | { | 98 | { |
99 | if (page->mapping != mapping) | 99 | if (page->mapping != mapping) |
100 | return; | 100 | return -EIO; |
101 | 101 | ||
102 | if (page_has_private(page)) | 102 | if (page_has_private(page)) |
103 | do_invalidatepage(page, 0); | 103 | do_invalidatepage(page, 0); |
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
108 | remove_from_page_cache(page); | 108 | remove_from_page_cache(page); |
109 | ClearPageMappedToDisk(page); | 109 | ClearPageMappedToDisk(page); |
110 | page_cache_release(page); /* pagecache ref */ | 110 | page_cache_release(page); /* pagecache ref */ |
111 | return 0; | ||
111 | } | 112 | } |
112 | 113 | ||
113 | /* | 114 | /* |
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
135 | return ret; | 136 | return ret; |
136 | } | 137 | } |
137 | 138 | ||
139 | int truncate_inode_page(struct address_space *mapping, struct page *page) | ||
140 | { | ||
141 | if (page_mapped(page)) { | ||
142 | unmap_mapping_range(mapping, | ||
143 | (loff_t)page->index << PAGE_CACHE_SHIFT, | ||
144 | PAGE_CACHE_SIZE, 0); | ||
145 | } | ||
146 | return truncate_complete_page(mapping, page); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Used to get rid of pages on hardware memory corruption. | ||
151 | */ | ||
152 | int generic_error_remove_page(struct address_space *mapping, struct page *page) | ||
153 | { | ||
154 | if (!mapping) | ||
155 | return -EINVAL; | ||
156 | /* | ||
157 | * Only punch for normal data pages for now. | ||
158 | * Handling other types like directories would need more auditing. | ||
159 | */ | ||
160 | if (!S_ISREG(mapping->host->i_mode)) | ||
161 | return -EIO; | ||
162 | return truncate_inode_page(mapping, page); | ||
163 | } | ||
164 | EXPORT_SYMBOL(generic_error_remove_page); | ||
165 | |||
166 | /* | ||
167 | * Safely invalidate one page from its pagecache mapping. | ||
168 | * It only drops clean, unused pages. The page must be locked. | ||
169 | * | ||
170 | * Returns 1 if the page is successfully invalidated, otherwise 0. | ||
171 | */ | ||
172 | int invalidate_inode_page(struct page *page) | ||
173 | { | ||
174 | struct address_space *mapping = page_mapping(page); | ||
175 | if (!mapping) | ||
176 | return 0; | ||
177 | if (PageDirty(page) || PageWriteback(page)) | ||
178 | return 0; | ||
179 | if (page_mapped(page)) | ||
180 | return 0; | ||
181 | return invalidate_complete_page(mapping, page); | ||
182 | } | ||
183 | |||
138 | /** | 184 | /** |
139 | * truncate_inode_pages - truncate range of pages specified by start & end byte offsets | 185 | * truncate_inode_pages - truncate range of pages specified by start & end byte offsets |
140 | * @mapping: mapping to truncate | 186 | * @mapping: mapping to truncate |
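
The block above introduces three helpers that the memory-failure code (and truncate itself) build on: truncate_inode_page() force-removes a page after unmapping it, generic_error_remove_page() restricts that to regular files, and invalidate_inode_page() only drops pages that are clean, unused and unmapped. A sketch of how an error handler might try them in order; the real policy lives in mm/memory-failure.c, so treat this purely as an illustration:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical helper: drop a corrupted pagecache page. Caller holds the
 * page lock, as truncate_complete_page() expects. */
static int example_drop_corrupted_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        if (!mapping)
                return -EINVAL;
        if (invalidate_inode_page(page))
                return 0;               /* clean and unused: already gone */
        return generic_error_remove_page(mapping, page);
}
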
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
196 | unlock_page(page); | 242 | unlock_page(page); |
197 | continue; | 243 | continue; |
198 | } | 244 | } |
199 | if (page_mapped(page)) { | 245 | truncate_inode_page(mapping, page); |
200 | unmap_mapping_range(mapping, | ||
201 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | ||
202 | PAGE_CACHE_SIZE, 0); | ||
203 | } | ||
204 | truncate_complete_page(mapping, page); | ||
205 | unlock_page(page); | 246 | unlock_page(page); |
206 | } | 247 | } |
207 | pagevec_release(&pvec); | 248 | pagevec_release(&pvec); |
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
238 | break; | 279 | break; |
239 | lock_page(page); | 280 | lock_page(page); |
240 | wait_on_page_writeback(page); | 281 | wait_on_page_writeback(page); |
241 | if (page_mapped(page)) { | 282 | truncate_inode_page(mapping, page); |
242 | unmap_mapping_range(mapping, | ||
243 | (loff_t)page->index<<PAGE_CACHE_SHIFT, | ||
244 | PAGE_CACHE_SIZE, 0); | ||
245 | } | ||
246 | if (page->index > next) | 283 | if (page->index > next) |
247 | next = page->index; | 284 | next = page->index; |
248 | next++; | 285 | next++; |
249 | truncate_complete_page(mapping, page); | ||
250 | unlock_page(page); | 286 | unlock_page(page); |
251 | } | 287 | } |
252 | pagevec_release(&pvec); | 288 | pagevec_release(&pvec); |
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
311 | if (lock_failed) | 347 | if (lock_failed) |
312 | continue; | 348 | continue; |
313 | 349 | ||
314 | if (PageDirty(page) || PageWriteback(page)) | 350 | ret += invalidate_inode_page(page); |
315 | goto unlock; | 351 | |
316 | if (page_mapped(page)) | ||
317 | goto unlock; | ||
318 | ret += invalidate_complete_page(mapping, page); | ||
319 | unlock: | ||
320 | unlock_page(page); | 352 | unlock_page(page); |
321 | if (next > end) | 353 | if (next > end) |
322 | break; | 354 | break; |
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping) | |||
465 | return invalidate_inode_pages2_range(mapping, 0, -1); | 497 | return invalidate_inode_pages2_range(mapping, 0, -1); |
466 | } | 498 | } |
467 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | 499 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2); |
500 | |||
501 | /** | ||
502 | * truncate_pagecache - unmap and remove pagecache that has been truncated | ||
503 | * @inode: inode | ||
504 | * @old: old file offset | ||
505 | * @new: new file offset | ||
506 | * | ||
507 | * inode's new i_size must already be written before truncate_pagecache | ||
508 | * is called. | ||
509 | * | ||
510 | * This function should typically be called before the filesystem | ||
511 | * releases resources associated with the freed range (eg. deallocates | ||
512 | * blocks). This way, pagecache will always stay logically coherent | ||
513 | * with on-disk format, and the filesystem would not have to deal with | ||
514 | * situations such as writepage being called for a page that has already | ||
515 | * had its underlying blocks deallocated. | ||
516 | */ | ||
517 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | ||
518 | { | ||
519 | if (new < old) { | ||
520 | struct address_space *mapping = inode->i_mapping; | ||
521 | |||
522 | /* | ||
523 | * unmap_mapping_range is called twice, first simply for | ||
524 | * efficiency so that truncate_inode_pages does fewer | ||
525 | * single-page unmaps. However after this first call, and | ||
526 | * before truncate_inode_pages finishes, it is possible for | ||
527 | * private pages to be COWed, which remain after | ||
528 | * truncate_inode_pages finishes, hence the second | ||
529 | * unmap_mapping_range call must be made for correctness. | ||
530 | */ | ||
531 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
532 | truncate_inode_pages(mapping, new); | ||
533 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
534 | } | ||
535 | } | ||
536 | EXPORT_SYMBOL(truncate_pagecache); | ||
537 | |||
538 | /** | ||
539 | * vmtruncate - unmap mappings "freed" by truncate() syscall | ||
540 | * @inode: inode of the file used | ||
541 | * @offset: file offset to start truncating | ||
542 | * | ||
543 | * NOTE! We have to be ready to update the memory sharing | ||
544 | * between the file and the memory map for a potential last | ||
545 | * incomplete page. Ugly, but necessary. | ||
546 | */ | ||
547 | int vmtruncate(struct inode *inode, loff_t offset) | ||
548 | { | ||
549 | loff_t oldsize; | ||
550 | int error; | ||
551 | |||
552 | error = inode_newsize_ok(inode, offset); | ||
553 | if (error) | ||
554 | return error; | ||
555 | oldsize = inode->i_size; | ||
556 | i_size_write(inode, offset); | ||
557 | truncate_pagecache(inode, oldsize, offset); | ||
558 | if (inode->i_op->truncate) | ||
559 | inode->i_op->truncate(inode); | ||
560 | |||
561 | return error; | ||
562 | } | ||
563 | EXPORT_SYMBOL(vmtruncate); | ||
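
The new vmtruncate() above spells out the calling convention that the truncate_pagecache() comment documents: check the new size, write i_size, trim the pagecache, and only then let the filesystem release blocks. A sketch of a filesystem-side setsize helper following that order; examplefs_free_blocks() stands in for the filesystem-specific part and is not a real function:

#include <linux/fs.h>
#include <linux/mm.h>

static int examplefs_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize = inode->i_size;
        int error;

        error = inode_newsize_ok(inode, newsize);       /* rlimit / s_maxbytes checks */
        if (error)
                return error;

        i_size_write(inode, newsize);
        truncate_pagecache(inode, oldsize, newsize);    /* unmap and drop stale pagecache */
        /* examplefs_free_blocks(inode, newsize);          filesystem-specific cleanup */
        return 0;
}
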
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5535da1d6961..69511e663234 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -184,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, | |||
184 | return ret; | 184 | return ret; |
185 | } | 185 | } |
186 | 186 | ||
187 | static inline int is_vmalloc_or_module_addr(const void *x) | 187 | int is_vmalloc_or_module_addr(const void *x) |
188 | { | 188 | { |
189 | /* | 189 | /* |
190 | * ARM, x86-64 and sparc64 put modules in a special place, | 190 | * ARM, x86-64 and sparc64 put modules in a special place, |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 359c3c57ef85..64e438898832 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
663 | * processes. Try to unmap it here. | 663 | * processes. Try to unmap it here. |
664 | */ | 664 | */ |
665 | if (page_mapped(page) && mapping) { | 665 | if (page_mapped(page) && mapping) { |
666 | switch (try_to_unmap(page, 0)) { | 666 | switch (try_to_unmap(page, TTU_UNMAP)) { |
667 | case SWAP_FAIL: | 667 | case SWAP_FAIL: |
668 | goto activate_locked; | 668 | goto activate_locked; |
669 | case SWAP_AGAIN: | 669 | case SWAP_AGAIN: |
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1836 | 1836 | ||
1837 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1837 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
1838 | 1838 | ||
1839 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | ||
1840 | gfp_t gfp_mask, bool noswap, | ||
1841 | unsigned int swappiness, | ||
1842 | struct zone *zone, int nid) | ||
1843 | { | ||
1844 | struct scan_control sc = { | ||
1845 | .may_writepage = !laptop_mode, | ||
1846 | .may_unmap = 1, | ||
1847 | .may_swap = !noswap, | ||
1848 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1849 | .swappiness = swappiness, | ||
1850 | .order = 0, | ||
1851 | .mem_cgroup = mem, | ||
1852 | .isolate_pages = mem_cgroup_isolate_pages, | ||
1853 | }; | ||
1854 | nodemask_t nm = nodemask_of_node(nid); | ||
1855 | |||
1856 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | ||
1857 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | ||
1858 | sc.nodemask = &nm; | ||
1859 | sc.nr_reclaimed = 0; | ||
1860 | sc.nr_scanned = 0; | ||
1861 | /* | ||
1862 | * NOTE: Although we can get the priority field, using it | ||
1863 | * here is not a good idea, since it limits the pages we can scan. | ||
1864 | * If we don't reclaim here, the shrink_zone from balance_pgdat | ||
1865 | * will pick up pages from other mem cgroups as well. We hack | ||
1866 | * the priority and make it zero. | ||
1867 | */ | ||
1868 | shrink_zone(0, zone, &sc); | ||
1869 | return sc.nr_reclaimed; | ||
1870 | } | ||
1871 | |||
1839 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 1872 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
1840 | gfp_t gfp_mask, | 1873 | gfp_t gfp_mask, |
1841 | bool noswap, | 1874 | bool noswap, |
1842 | unsigned int swappiness) | 1875 | unsigned int swappiness) |
1843 | { | 1876 | { |
1877 | struct zonelist *zonelist; | ||
1844 | struct scan_control sc = { | 1878 | struct scan_control sc = { |
1845 | .may_writepage = !laptop_mode, | 1879 | .may_writepage = !laptop_mode, |
1846 | .may_unmap = 1, | 1880 | .may_unmap = 1, |
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1852 | .isolate_pages = mem_cgroup_isolate_pages, | 1886 | .isolate_pages = mem_cgroup_isolate_pages, |
1853 | .nodemask = NULL, /* we don't care the placement */ | 1887 | .nodemask = NULL, /* we don't care the placement */ |
1854 | }; | 1888 | }; |
1855 | struct zonelist *zonelist; | ||
1856 | 1889 | ||
1857 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 1890 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
1858 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 1891 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
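
mem_cgroup_shrink_node_zone() above pins its scan to one node by building an on-stack nodemask with nodemask_of_node() and pointing sc.nodemask at it. A tiny sketch of that nodemask idiom, outside of any scan_control:

#include <linux/nodemask.h>
#include <linux/kernel.h>

static void example_pin_to_node(int nid)
{
        nodemask_t nm = nodemask_of_node(nid);  /* mask with only 'nid' set */

        /* a reclaim scan_control's .nodemask can point at 'nm' so that page
         * isolation is limited to that node for the duration of the scan */
        pr_debug("node %d: %d node(s) in mask\n", nid, nodes_weight(nm));
}
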
@@ -1974,6 +2007,7 @@ loop_again: | |||
1974 | for (i = 0; i <= end_zone; i++) { | 2007 | for (i = 0; i <= end_zone; i++) { |
1975 | struct zone *zone = pgdat->node_zones + i; | 2008 | struct zone *zone = pgdat->node_zones + i; |
1976 | int nr_slab; | 2009 | int nr_slab; |
2010 | int nid, zid; | ||
1977 | 2011 | ||
1978 | if (!populated_zone(zone)) | 2012 | if (!populated_zone(zone)) |
1979 | continue; | 2013 | continue; |
@@ -1988,6 +2022,15 @@ loop_again: | |||
1988 | temp_priority[i] = priority; | 2022 | temp_priority[i] = priority; |
1989 | sc.nr_scanned = 0; | 2023 | sc.nr_scanned = 0; |
1990 | note_zone_scanning_priority(zone, priority); | 2024 | note_zone_scanning_priority(zone, priority); |
2025 | |||
2026 | nid = pgdat->node_id; | ||
2027 | zid = zone_idx(zone); | ||
2028 | /* | ||
2029 | * Call soft limit reclaim before calling shrink_zone. | ||
2030 | * For now we ignore the return value | ||
2031 | */ | ||
2032 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, | ||
2033 | nid, zid); | ||
1991 | /* | 2034 | /* |
1992 | * We put equal pressure on every zone, unless one | 2035 | * We put equal pressure on every zone, unless one |
1993 | * zone has way too many pages free already. | 2036 | * zone has way too many pages free already. |
@@ -2801,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) | |||
2801 | unsigned long scan_unevictable_pages; | 2844 | unsigned long scan_unevictable_pages; |
2802 | 2845 | ||
2803 | int scan_unevictable_handler(struct ctl_table *table, int write, | 2846 | int scan_unevictable_handler(struct ctl_table *table, int write, |
2804 | struct file *file, void __user *buffer, | 2847 | void __user *buffer, |
2805 | size_t *length, loff_t *ppos) | 2848 | size_t *length, loff_t *ppos) |
2806 | { | 2849 | { |
2807 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 2850 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
2808 | 2851 | ||
2809 | if (write && *(unsigned long *)table->data) | 2852 | if (write && *(unsigned long *)table->data) |
2810 | scan_all_zones_unevictable_pages(); | 2853 | scan_all_zones_unevictable_pages(); |